From f5c5d4c4f385f115d3a0f569c18e4b66106ca4ff Mon Sep 17 00:00:00 2001 From: oahzxl Date: Thu, 27 Oct 2022 10:57:07 +0800 Subject: [PATCH 001/209] init --- chunk_codegen.py | 1047 ++++++++++++++++++++++++++++++++++++++++++ chunk_codegen_run.py | 177 +++++++ 2 files changed, 1224 insertions(+) create mode 100644 chunk_codegen.py create mode 100644 chunk_codegen_run.py diff --git a/chunk_codegen.py b/chunk_codegen.py new file mode 100644 index 000000000000..684028c014de --- /dev/null +++ b/chunk_codegen.py @@ -0,0 +1,1047 @@ +import colossalai +import torch +from typing import List, Callable, Any, Tuple, Dict, Iterable + +try: + from torch.fx.node import Node, Argument, map_arg, _type_repr, _get_qualified_name + from torch.fx.graph import _Namespace, PythonCode, _custom_builtins, _is_from_torch, _format_target, magic_methods, CodeGen, _origin_type_map, inplace_methods, _CustomBuiltin + CODEGEN_AVAILABLE = True +except: + from torch.fx.graph import _Namespace, PythonCode, _custom_builtins, _is_from_torch, _format_target, magic_methods, _origin_type_map, _format_args, _CustomBuiltin + from torch.fx.node import Node, Argument, map_arg, _type_repr, _get_qualified_name + CODEGEN_AVAILABLE = False + +if CODEGEN_AVAILABLE: + __all__ = ['ActivationCheckpointCodeGen'] +else: + __all__ = ['python_code_with_activation_checkpoint'] + + +def _gen_saved_tensors_hooks(): + """ + Generate saved tensors hooks + """ + + pack_hook = """def pack_hook_input(self, x): + if getattr(x, "offload", False): + return (x.device, x.cpu()) + else: + return x + +def pack_hook_no_input(self, x): + if getattr(x, "offload", True): + return (x.device, x.cpu()) + else: + return x +""" + + unpack_hook = """def unpack_hook(self, packed): + if isinstance(packed, tuple): + device, tensor = packed + return tensor.to(device) + else: + return packed +""" + + return pack_hook, unpack_hook + + +def _gen_save_tensors_hooks_context(offload_input=True) -> str: + """Generate customized saved_tensors_hooks + + Args: 
+ offload_input (bool, optional): whether we need offload input, if offload_input=False, + we will use self.pack_hook_no_input instead. Defaults to True. + + Returns: + str: generated context + """ + + if offload_input: + context = "with torch.autograd.graph.saved_tensors_hooks(self.pack_hook_input, self.unpack_hook):\n" + else: + context = "with torch.autograd.graph.saved_tensors_hooks(self.pack_hook_no_input, self.unpack_hook):\n" + return context + + +def _gen_save_on_cpu_context(): + """ + Generate save on cpu context + """ + + context = "with torch.autograd.graph.save_on_cpu(pin_memory=True):\n" + return context + + +def _find_input_and_output_nodes(nodes: List[Node]): + """ + Find the input and output node names which are not found in the given list of nodes. + """ + input_nodes = [] + output_nodes = [] + + # if a node has an input node which is not in the node list + # we treat that input node as the input of the checkpoint function + for node in nodes: + for input_node in node._input_nodes.keys(): + node_repr = repr(input_node) + if input_node not in nodes and node_repr not in input_nodes: + input_nodes.append(node_repr) + + # if a node has a user node which is not in the node list + # we treat that user node as the node receiving the current node output + for node in nodes: + for output_node in node.users.keys(): + node_repr = repr(node) + if output_node not in nodes and node_repr not in output_nodes: + output_nodes.append(node_repr) + + return input_nodes, output_nodes + + +def _find_ckpt_regions(nodes: List[Node]): + """ + Find the checkpoint regions given a list of consecutive nodes. The outputs will be list + of tuples, each tuple is in the form of (start_index, end_index). 
+ """ + ckpt_nodes = [] + ckpt_regions = [] + start = -1 + end = -1 + current_region = None + + for idx, node in enumerate(nodes): + if hasattr(node, 'activation_checkpoint'): + act_ckpt_label = node.activation_checkpoint + + # this activation checkpoint label is not set yet + # meaning this is the first node of the activation ckpt region + if current_region is None: + current_region = act_ckpt_label + start = idx + + # if activation checkpoint has changed + # we restart the tracking + # e.g. node ckpt states = [ckpt1, ckpt2, ckpt2, ckpt2] + if act_ckpt_label != current_region: + assert start != -1 + ckpt_regions.append((start, idx - 1)) + current_region = act_ckpt_label + start = idx + end = -1 + elif current_region is not None and not hasattr(node, 'activation_checkpoint'): + # used to check the case below + # node ckpt states = [ckpt, ckpt, non-ckpt] + end = idx - 1 + assert start != -1 and end != -1 + ckpt_regions.append((start, end)) + start = end = -1 + current_region = None + else: + pass + return ckpt_regions + + +def _find_offload_regions(nodes: List[Node]): + """This function is to find the offload regions + In pofo algorithm, during annotation, we will annotate the offload region with the + list in the form of [idx, offload_input, offload_bar]. idx indicates the offload + region's index, offload_input is a bool type indicates whether we need to offload + the input, offload_bar is a bool type indicates whether we need to offload all the + intermediate x_bars of this region. 
+ """ + offload_regions = [] + offload_labels = [] + start = -1 + end = -1 + current_region = None + + for idx, node in enumerate(nodes): + if hasattr(node, 'activation_offload') and isinstance(getattr(node, 'activation_offload', None), Iterable): + act_offload_label = node.activation_offload + + if current_region == None: + current_region = act_offload_label + start = idx + offload_labels.append(act_offload_label) + + if act_offload_label != current_region: + assert start != -1 + offload_regions.append((start, idx - 1)) + offload_labels.append(act_offload_label) + current_region = act_offload_label + start = idx + end = -1 + + else: + if current_region is not None: + end = idx - 1 + assert start != -1 and end != -1 + offload_regions.append((start, end)) + start = end = -1 + current_region = None + + else: + pass + + return offload_regions, offload_labels + + +def _gen_ckpt_fn_def(label, free_vars: List[str]) -> str: + """ + Generate the checkpoint function definition + """ + return f"def checkpoint_{label}({', '.join(['self'] + free_vars)}):" + + +def _gen_ckpt_output(output_vars: List[str]) -> str: + """ + Generate the return statement for checkpoint region + """ + return f"return {', '.join(output_vars)}" + + +def _gen_ckpt_usage(label, activation_offload, input_vars, output_vars, use_reentrant=True): + """ + Generate the checkpoint function call code text + """ + outputs = ', '.join(output_vars) + inputs = ', '.join(input_vars) + return f'{outputs} = colossalai.utils.activation_checkpoint.checkpoint(self.checkpoint_{label}, {activation_offload}, {inputs}, use_reentrant={use_reentrant})' + + +def _end_of_ckpt(node: Node, check_idx: int) -> bool: + """Check if the node could end the ckpt region + + Args: + node (Node): torch.fx.Node + check_idx (int): the index of checkpoint level for + nested checkpoint + + Returns: + bool + """ + if hasattr(node, "activation_checkpoint"): + if isinstance(node.activation_checkpoint, list): + return 
node.activation_checkpoint[check_idx] == None + else: + return False + else: + return True + + +def _find_nested_ckpt_regions(nodes, check_idx=0): + """ + Find the nested checkpoint regions given a list of consecutive nodes. The outputs + will be list of tuples, each tuple is in the form of (start_index, end_index). + """ + ckpt_regions = [] + start = -1 + end = -1 + current_region = None + + for idx, node in enumerate(nodes): + if hasattr(node, 'activation_checkpoint'): + if isinstance(getattr(node, 'activation_checkpoint'), int): + act_ckpt_label = node.activation_checkpoint + else: + act_ckpt_label = node.activation_checkpoint[check_idx] + + # this activation checkpoint label is not set yet + # meaning this is the first node of the activation ckpt region + if current_region is None: + current_region = act_ckpt_label + start = idx + + # if activation checkpoint has changed + # we restart the tracking + # e.g. node ckpt states = [ckpt1, ckpt2, ckpt2, ckpt2] + if act_ckpt_label != current_region: + assert start != -1 + ckpt_regions.append((start, idx - 1)) + current_region = act_ckpt_label + start = idx + end = -1 + elif current_region is not None and _end_of_ckpt(node, check_idx): + # used to check the case below + # node ckpt states = [ckpt, ckpt, non-ckpt] + end = idx - 1 + assert start != -1 and end != -1 + ckpt_regions.append((start, end)) + start = end = -1 + current_region = None + else: + pass + + if current_region is not None: + end = len(nodes) - 1 + ckpt_regions.append((start, end)) + return ckpt_regions + + +def emit_ckpt_func(body, + ckpt_func, + node_list: List[Node], + emit_node_func, + delete_unused_value_func, + level=0, + in_ckpt=False): + """Emit ckpt fuction in nested way + + Args: + body: forward code, in recursive calls, this part will be checkpoint + functions code + ckpt_func: checkpoint functions code, in recursive calls, this part + will be a buffer + node_list (List[Node]): list of torch.fx.Node + emit_node_func: function to emit a node + 
delete_unused_value_func: function to delete unused value + level (int, optional): checkpoint level. Defaults to 0. + in_ckpt (bool, optional): indicates wether the func is in recursive + call. Defaults to False. + """ + inputs, outputs = _find_input_and_output_nodes(node_list) + + # if the current checkpoint function use int as label, using old generation method + if isinstance(node_list[0].activation_checkpoint, int): + label = node_list[0].activation_checkpoint + ckpt_fn_def = _gen_ckpt_fn_def(label, inputs) + ckpt_func.append(f'{ckpt_fn_def}\n') + for node in node_list: + emit_node_func(node, ckpt_func) + ckpt_func[-1] = ' ' + ckpt_func[-1] + delete_unused_value_func(node, ckpt_func) + + ckpt_func.append(' ' + _gen_ckpt_output(outputs) + '\n\n') + activation_offload = getattr(node_list[0], "activation_offload", False) + usage = _gen_ckpt_usage(label, activation_offload, inputs, outputs, False) + usage += "\n" + body.append(usage) + + # use nested ckpt function codegen + else: + # label given by each layer, e.g. 
if you are currently at level [0, 1, 1] + # the label will be '0_1_1' + label = "_".join([str(idx) for idx in node_list[0].activation_checkpoint[:level + 1]]) + ckpt_fn_def = _gen_ckpt_fn_def(label, inputs) + ckpt_func.append(f'{ckpt_fn_def}\n') + + # if there is more level to fetch + if level + 1 < len(node_list[0].activation_checkpoint): + ckpt_regions = _find_nested_ckpt_regions(node_list, level + 1) + start_idx = [item[0] for item in ckpt_regions] + end_idx = [item[1] for item in ckpt_regions] + + # use ckpt_func_buffer to store nested checkpoint functions + ckpt_func_buffer = [] + node_idx = 0 + while 1: + if node_idx >= len(node_list): + break + + if node_idx in start_idx: + ckpt_node_list = node_list[node_idx:end_idx[start_idx.index(node_idx)] + 1] + emit_ckpt_func(ckpt_func, ckpt_func_buffer, ckpt_node_list, emit_node_func, + delete_unused_value_func, level + 1, True) + node_idx += len(ckpt_node_list) + + else: + node = node_list[node_idx] + emit_node_func(node, ckpt_func) + ckpt_func[-1] = ' ' + ckpt_func[-1] + delete_unused_value_func(node, ckpt_func) + node_idx += 1 + + ckpt_func.append(' ' + _gen_ckpt_output(outputs) + '\n\n') + ckpt_func += ckpt_func_buffer + activation_offload = getattr(node_list[0], "activation_offload", False) + usage = _gen_ckpt_usage(label, activation_offload, inputs, outputs, False) + '\n' + if in_ckpt: + usage = ' ' + usage + body.append(usage) + + # last level + else: + for node in node_list: + emit_node_func(node, ckpt_func) + ckpt_func[-1] = ' ' + ckpt_func[-1] + delete_unused_value_func(node, ckpt_func) + + ckpt_func.append(' ' + _gen_ckpt_output(outputs) + '\n\n') + activation_offload = getattr(node_list[0], "activation_offload", False) + usage = _gen_ckpt_usage(label, activation_offload, inputs, outputs, False) + '\n' + if in_ckpt: + usage = ' ' + usage + body.append(usage) + + +def emit_code_with_nested_activation_checkpoint(body, ckpt_func, nodes, emit_node_func, delete_unused_value_func): + """Emit code with nested 
def emit_code_with_nested_activation_checkpoint(body, ckpt_func, nodes, emit_node_func, delete_unused_value_func):
    """Emit code with nested activation checkpoint.

    When we detect that some node.activation_checkpoint is a List, we will use
    this function to emit the activation checkpoint codes.

    Args:
        body: forward code
        ckpt_func: checkpoint functions code
        nodes: graph.nodes
        emit_node_func: function to emit node
        delete_unused_value_func: function to remove the unused value
    """
    # NOTE(review): indent width in generated-code strings reconstructed as
    # 4 spaces — source formatting was mangled; verify against upstream.
    ckpt_regions = _find_nested_ckpt_regions(nodes, 0)
    start_idx = [item[0] for item in ckpt_regions]
    end_idx = [item[1] for item in ckpt_regions]

    # find the offload regions
    offload_regions, offload_labels = _find_offload_regions(nodes)
    offload_starts = [item[0] for item in offload_regions]
    offload_ends = [item[1] for item in offload_regions]
    offload_inputs = []
    offload_outputs = []
    within_offload_region = False

    node_list = list(nodes)

    # find the input and output var names for each offload region
    for idx, (start, end) in enumerate(offload_regions):
        offload_node_list = node_list[start:end + 1]
        inputs, outputs = _find_input_and_output_nodes(offload_node_list)
        offload_inputs.append(inputs)
        offload_outputs.append(outputs)

    # this flag is to prevent repeated insert of save tensors
    # hooks definition in ckpt_func
    is_hook_inserted = False
    node_idx = 0
    while 1:
        # break if we finish processing all the nodes
        if node_idx >= len(node_list):
            break

        # process ckpt_regions: hand the whole region to the nested emitter,
        # then skip past it
        if node_idx in start_idx:
            ckpt_node_list = node_list[node_idx:end_idx[start_idx.index(node_idx)] + 1]
            emit_ckpt_func(body, ckpt_func, ckpt_node_list, emit_node_func, delete_unused_value_func)
            node_idx += len(ckpt_node_list)

        # process node in forward function
        else:
            node = node_list[node_idx]

            if node_idx in offload_starts:
                # offload label layout is [idx, offload_input, offload_bar]
                # (see _find_offload_regions docstring)
                offload_label = offload_labels[offload_starts.index(node_idx)]
                _, offload_input, offload_bar = offload_label
                within_offload_region = True

                # insert hook functions if needed
                if not is_hook_inserted:
                    pack_hook, unpack_hook = _gen_saved_tensors_hooks()
                    ckpt_func.insert(0, "\n".join([pack_hook, unpack_hook]) + "\n")
                    is_hook_inserted = True

                if offload_input and offload_bar:
                    # offload everything in the region via save_on_cpu
                    body.append(_gen_save_on_cpu_context())

                elif offload_input:
                    # mark region inputs for offload, then open the hook context
                    for par in offload_inputs[offload_label[0]]:
                        body.append(f"setattr({par}, 'offload', True)\n")
                    body.append(_gen_save_tensors_hooks_context(offload_input=True))

                else:
                    # keep region inputs on device; offload the rest
                    for par in offload_inputs[offload_label[0]]:
                        body.append(f"setattr({par}, 'offload', False)\n")
                    body.append(_gen_save_tensors_hooks_context(offload_input=False))

            if within_offload_region:
                # statements inside the `with` context need one indent level
                emit_node_func(node, body)
                body[-1] = '    ' + body[-1]
                delete_unused_value_func(node, body)

            else:
                emit_node_func(node, body)
                delete_unused_value_func(node, body)

            if node_idx in offload_ends:
                within_offload_region = False

            node_idx += 1


def emit_code_with_activation_checkpoint(body, ckpt_func, nodes, emit_node_func, delete_unused_value_func):
    """Emit forward code plus `checkpoint_<i>` functions for flat (non-nested,
    int-labelled) activation checkpoint regions, and offload contexts."""
    # find the activation checkpoint regions
    ckpt_regions = _find_ckpt_regions(nodes)
    start_idx = [item[0] for item in ckpt_regions]
    end_idx = [item[1] for item in ckpt_regions]
    input_vars = []
    output_vars = []
    within_ckpt_region = False

    # find the offload regions
    offload_regions, offload_labels = _find_offload_regions(nodes)
    offload_starts = [item[0] for item in offload_regions]
    offload_ends = [item[1] for item in offload_regions]
    offload_inputs = []
    offload_outputs = []
    within_offload_region = False

    node_list = list(nodes)

    # use this variable to avoid inserting hook functions
    # to ckpt_func repeatedly
    is_hook_inserted = False

    # find the input and output var names for each region
    for idx, (start, end) in enumerate(ckpt_regions):
        ckpt_node_list = node_list[start:end + 1]
        inputs, outputs = _find_input_and_output_nodes(ckpt_node_list)
        input_vars.append(inputs)
        output_vars.append(outputs)

    # find the input and output var names for each offload region
    for idx, (start, end) in enumerate(offload_regions):
        offload_node_list = node_list[start:end + 1]
        inputs, outputs = _find_input_and_output_nodes(offload_node_list)
        offload_inputs.append(inputs)
        offload_outputs.append(outputs)

    # append code text to body
    for idx, node in enumerate(node_list):
        # if this is the first node of the ckpt region
        # append the ckpt function definition
        if idx in start_idx:
            label = start_idx.index(idx)
            ckpt_fn_def = _gen_ckpt_fn_def(label, input_vars[label])
            ckpt_func.append(f'{ckpt_fn_def}\n')
            within_ckpt_region = True

        if idx in offload_starts:
            offload_label = offload_labels[offload_starts.index(idx)]
            _, offload_input, offload_bar = offload_label
            within_offload_region = True

            # insert hook functions if needed
            if not is_hook_inserted:
                pack_hook, unpack_hook = _gen_saved_tensors_hooks()
                ckpt_func.insert(0, "\n".join([pack_hook, unpack_hook]) + "\n")
                is_hook_inserted = True

            if offload_input and offload_bar:
                body.append(_gen_save_on_cpu_context())

            elif offload_input:
                for par in offload_inputs[offload_label[0]]:
                    body.append(f"setattr({par}, 'offload', True)\n")
                body.append(_gen_save_tensors_hooks_context(offload_input=True))

            else:
                for par in offload_inputs[offload_label[0]]:
                    body.append(f"setattr({par}, 'offload', False)\n")
                body.append(_gen_save_tensors_hooks_context(offload_input=False))

        # NOTE: emit_node does not emit a string with newline. It depends
        # on delete_unused_values to append one
        # NOTE: currently we separate body and ckpt_func definition
        if within_ckpt_region:
            emit_node_func(node, ckpt_func)
            ckpt_func[-1] = '    ' + ckpt_func[-1]
            delete_unused_value_func(node, ckpt_func)

        elif within_offload_region:
            emit_node_func(node, body)
            body[-1] = '    ' + body[-1]
            delete_unused_value_func(node, body)

        else:
            emit_node_func(node, body)
            delete_unused_value_func(node, body)

        if idx in end_idx:
            # if this is the last node of the ckpt region
            # generate return statement
            label = end_idx.index(idx)
            return_statement = _gen_ckpt_output(output_vars[label])
            return_statement = f'    {return_statement}\n\n'
            ckpt_func.append(return_statement)

            # we need to check if the checkpoint need to offload the input
            start_node_idx = start_idx[label]
            if hasattr(node_list[start_node_idx], 'activation_offload'):
                activation_offload = node_list[start_node_idx].activation_offload
            else:
                activation_offload = False

            # we need to check if the checkpoint need use_reentrant=False:
            # reentrant checkpointing is disabled when a region input feeds an
            # in-place op inside this region, or when all inputs are leaves
            use_reentrant = True
            non_leaf_input = 0
            for var in input_vars[label]:
                input_node = next(item for item in node_list if item.name == var)
                if input_node.op != "placeholder":
                    non_leaf_input = 1
                for user in input_node.users:
                    if hasattr(user, "activation_checkpoint"):
                        if user.activation_checkpoint == label:
                            if user.op == "call_module":
                                if hasattr(user.graph.owning_module.get_submodule(user.target), "inplace"):
                                    use_reentrant = not user.graph.owning_module.get_submodule(user.target).inplace

                            elif user.op == "call_function":
                                if "inplace" in user.kwargs:
                                    use_reentrant = not user.kwargs["inplace"]

            # if all the inputs are leaf nodes, we need to set use_reentrant = False
            if not non_leaf_input:
                use_reentrant = False

            # generate checkpoint function call in a new line
            usage = _gen_ckpt_usage(label, activation_offload, input_vars[label], output_vars[label], use_reentrant)
            usage += '\n'
            body.append(usage)
            within_ckpt_region = False

        if idx in offload_ends:
            within_offload_region = False
if CODEGEN_AVAILABLE:

    class ActivationCheckpointCodeGen(CodeGen):
        """torch.fx ``CodeGen`` subclass that prepends generated
        ``checkpoint_*`` wrapper functions (and saved-tensor hook code) to the
        emitted ``forward`` source.

        Used on newer torch versions where ``torch.fx.graph.CodeGen`` exists
        (see the CODEGEN_AVAILABLE probe at the top of this module).
        """

        def _gen_python_code(self, nodes, root_module: str, namespace: _Namespace) -> PythonCode:
            # Largely copied from torch.fx.graph.CodeGen._gen_python_code;
            # modifications are marked with "Modified" / NOTE comments below.
            free_vars: List[str] = []
            body: List[str] = []
            globals_: Dict[str, Any] = {}
            wrapped_fns: Dict[str, None] = {}

            # Wrap string in list to pass by reference
            maybe_return_annotation: List[str] = ['']

            def add_global(name_hint: str, obj: Any):
                """Add an obj to be tracked as a global.

                We call this for names that reference objects external to the
                Graph, like functions or types.

                Returns: the global name that should be used to reference 'obj' in generated source.
                """
                if _is_from_torch(obj) and obj != torch.device:    # to support registering torch.device
                    # HACK: workaround for how torch custom ops are registered. We
                    # can't import them like normal modules so they must retain their
                    # fully qualified name.
                    return _get_qualified_name(obj)

                # normalize the name hint to get a proper identifier
                global_name = namespace.create_name(name_hint, obj)

                if global_name in globals_:
                    assert globals_[global_name] is obj
                    return global_name
                globals_[global_name] = obj
                return global_name

            # set _custom_builtins here so that we needn't import colossalai in forward
            _custom_builtins["colossalai"] = _CustomBuiltin("import colossalai", colossalai)

            # Pre-fill the globals table with registered builtins.
            for name, (_, obj) in _custom_builtins.items():
                add_global(name, obj)

            def type_repr(o: Any):
                if o == ():
                    # Empty tuple is used for empty tuple type annotation Tuple[()]
                    return '()'

                typename = _type_repr(o)

                if hasattr(o, '__origin__'):
                    # This is a generic type, e.g. typing.List[torch.Tensor]
                    origin_type = _origin_type_map.get(o.__origin__, o.__origin__)
                    origin_typename = add_global(_type_repr(origin_type), origin_type)

                    if hasattr(o, '__args__'):
                        # Assign global names for each of the inner type variables.
                        args = [type_repr(arg) for arg in o.__args__]

                        if len(args) == 0:
                            # Bare type, such as `typing.Tuple` with no subscript
                            # This code-path used in Python < 3.9
                            return origin_typename

                        return f'{origin_typename}[{",".join(args)}]'
                    else:
                        # Bare type, such as `typing.Tuple` with no subscript
                        # This code-path used in Python 3.9+
                        return origin_typename

                # Common case: this is a regular module name like 'foo.bar.baz'
                return add_global(typename, o)

            def _format_args(args: Tuple[Argument, ...], kwargs: Dict[str, Argument]) -> str:
                # local override of torch.fx.graph._format_args so NamedTuple
                # types encountered in args get registered as globals

                def _get_repr(arg):
                    # Handle NamedTuples (if it has `_fields`) via add_global.
                    if isinstance(arg, tuple) and hasattr(arg, '_fields'):
                        qualified_name = _get_qualified_name(type(arg))
                        global_name = add_global(qualified_name, type(arg))
                        return f"{global_name}{repr(tuple(arg))}"
                    return repr(arg)

                args_s = ', '.join(_get_repr(a) for a in args)
                kwargs_s = ', '.join(f'{k} = {_get_repr(v)}' for k, v in kwargs.items())
                if args_s and kwargs_s:
                    return f'{args_s}, {kwargs_s}'
                return args_s or kwargs_s

            # Run through reverse nodes and record the first instance of a use
            # of a given node. This represents the *last* use of the node in the
            # execution order of the program, which we will use to free unused
            # values
            node_to_last_use: Dict[Node, Node] = {}
            user_to_last_uses: Dict[Node, List[Node]] = {}

            def register_last_uses(n: Node, user: Node):
                if n not in node_to_last_use:
                    node_to_last_use[n] = user
                    user_to_last_uses.setdefault(user, []).append(n)

            for node in reversed(nodes):
                map_arg(node.args, lambda n: register_last_uses(n, node))
                map_arg(node.kwargs, lambda n: register_last_uses(n, node))

            # NOTE: we add a variable to distinguish body and ckpt_func
            def delete_unused_values(user: Node, body):
                """
                Delete values after their last use. This ensures that values that are
                not used in the remainder of the code are freed and the memory usage
                of the code is optimal.
                """
                if user.op == 'placeholder':
                    return
                if user.op == 'output':
                    body.append('\n')
                    return
                nodes_to_delete = user_to_last_uses.get(user, [])
                if len(nodes_to_delete):
                    to_delete_str = ' = '.join([repr(n) for n in nodes_to_delete] + ['None'])
                    body.append(f'; {to_delete_str}\n')
                else:
                    body.append('\n')

            # NOTE: we add a variable to distinguish body and ckpt_func
            def emit_node(node: Node, body):
                maybe_type_annotation = '' if node.type is None else f' : {type_repr(node.type)}'
                if node.op == 'placeholder':
                    assert isinstance(node.target, str)
                    maybe_default_arg = '' if not node.args else f' = {repr(node.args[0])}'
                    free_vars.append(f'{node.target}{maybe_type_annotation}{maybe_default_arg}')
                    raw_name = node.target.replace('*', '')
                    if raw_name != repr(node):
                        body.append(f'{repr(node)} = {raw_name}\n')
                    return
                elif node.op == 'call_method':
                    assert isinstance(node.target, str)
                    body.append(
                        f'{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.target)}'
                        f'({_format_args(node.args[1:], node.kwargs)})')
                    return
                elif node.op == 'call_function':
                    assert callable(node.target)
                    # pretty print operators
                    if node.target.__module__ == '_operator' and node.target.__name__ in magic_methods:
                        assert isinstance(node.args, tuple)
                        body.append(f'{repr(node)}{maybe_type_annotation} = '
                                    f'{magic_methods[node.target.__name__].format(*(repr(a) for a in node.args))}')
                        return

                    # pretty print inplace operators; required for jit.script to work properly
                    # not currently supported in normal FX graphs, but generated by torchdynamo
                    if node.target.__module__ == '_operator' and node.target.__name__ in inplace_methods:
                        body.append(f'{inplace_methods[node.target.__name__].format(*(repr(a) for a in node.args))}; '
                                    f'{repr(node)}{maybe_type_annotation} = {repr(node.args[0])}')
                        return

                    qualified_name = _get_qualified_name(node.target)
                    global_name = add_global(qualified_name, node.target)
                    # special case for getattr: node.args could be 2-argument or 3-argument
                    # 2-argument: attribute access; 3-argument: fall through to attrib function call with default value
                    if global_name == 'getattr' and \
                            isinstance(node.args, tuple) and \
                            isinstance(node.args[1], str) and \
                            node.args[1].isidentifier() and \
                            len(node.args) == 2:
                        body.append(
                            f'{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.args[1])}')
                        return
                    body.append(
                        f'{repr(node)}{maybe_type_annotation} = {global_name}({_format_args(node.args, node.kwargs)})')
                    if node.meta.get('is_wrapped', False):
                        wrapped_fns.setdefault(global_name)
                    return
                elif node.op == 'call_module':
                    assert isinstance(node.target, str)
                    body.append(f'{repr(node)}{maybe_type_annotation} = '
                                f'{_format_target(root_module, node.target)}({_format_args(node.args, node.kwargs)})')
                    return
                elif node.op == 'get_attr':
                    assert isinstance(node.target, str)
                    body.append(f'{repr(node)}{maybe_type_annotation} = {_format_target(root_module, node.target)}')
                    return
                elif node.op == 'output':
                    if node.type is not None:
                        maybe_return_annotation[0] = f" -> {type_repr(node.type)}"
                    body.append(self.generate_output(node.args[0]))
                    return
                raise NotImplementedError(f'node: {node.op} {node.target}')

            # Modified for activation checkpointing
            ckpt_func = []

            # if any node has a list of labels for activation_checkpoint, we
            # will use nested type of activation checkpoint codegen
            if any(isinstance(getattr(node, "activation_checkpoint", None), Iterable) for node in nodes):
                emit_code_with_nested_activation_checkpoint(body, ckpt_func, nodes, emit_node, delete_unused_values)
            else:
                emit_code_with_activation_checkpoint(body, ckpt_func, nodes, emit_node, delete_unused_values)

            if len(body) == 0:
                # If the Graph has no non-placeholder nodes, no lines for the body
                # have been emitted. To continue to have valid Python code, emit a
                # single pass statement
                body.append('pass\n')

            if len(wrapped_fns) > 0:
                wrap_name = add_global('wrap', torch.fx.wrap)
                wrap_stmts = '\n'.join([f'{wrap_name}("{name}")' for name in wrapped_fns])
            else:
                wrap_stmts = ''

            if self._body_transformer:
                body = self._body_transformer(body)

            for name, value in self.additional_globals():
                add_global(name, value)

            # as we need colossalai.utils.checkpoint, we need to import colossalai
            # in forward function
            prologue = self.gen_fn_def(free_vars, maybe_return_annotation[0])
            # prepend the generated checkpoint functions before the forward def
            prologue = ''.join(ckpt_func) + prologue
            prologue = prologue    # no-op kept from upstream

            code = ''.join(body)
            code = '\n'.join('    ' + line for line in code.split('\n'))
            fn_code = f"""
{wrap_stmts}

{prologue}
{code}"""
            return PythonCode(fn_code, globals_)
def python_code_with_activation_checkpoint(self, root_module: str, namespace: _Namespace) -> PythonCode:
    """
    This method is copied from the _python_code of torch.fx.graph.Graph. Modifications are made so that it can generate
    code for activation checkpoint.

    Fallback used on older torch versions without ``torch.fx.graph.CodeGen``
    (CODEGEN_AVAILABLE is False); ``self`` is the ``torch.fx.Graph`` instance.
    """
    free_vars: List[str] = []
    body: List[str] = []
    globals_: Dict[str, Any] = {}
    wrapped_fns: Dict[str, None] = {}

    # Wrap string in list to pass by reference
    maybe_return_annotation: List[str] = ['']

    def add_global(name_hint: str, obj: Any):
        """Add an obj to be tracked as a global.

        We call this for names that reference objects external to the
        Graph, like functions or types.

        Returns: the global name that should be used to reference 'obj' in generated source.
        """
        if _is_from_torch(obj) and obj != torch.device:    # to support registering torch.device
            # HACK: workaround for how torch custom ops are registered. We
            # can't import them like normal modules so they must retain their
            # fully qualified name.
            return _get_qualified_name(obj)

        # normalize the name hint to get a proper identifier
        global_name = namespace.create_name(name_hint, obj)

        if global_name in globals_:
            assert globals_[global_name] is obj
            return global_name
        globals_[global_name] = obj
        return global_name

    # set _custom_builtins here so that we needn't import colossalai in forward
    _custom_builtins["colossalai"] = _CustomBuiltin("import colossalai", colossalai)

    # Pre-fill the globals table with registered builtins.
    for name, (_, obj) in _custom_builtins.items():
        add_global(name, obj)

    def type_repr(o: Any):
        if o == ():
            # Empty tuple is used for empty tuple type annotation Tuple[()]
            return '()'

        typename = _type_repr(o)

        # This is a generic type, e.g. typing.List[torch.Tensor]
        if hasattr(o, '__origin__'):
            origin_type = _origin_type_map.get(o.__origin__, o.__origin__)
            origin_typename = add_global(_type_repr(origin_type), origin_type)

            # Assign global names for each of the inner type variables.
            args = [type_repr(arg) for arg in o.__args__]

            return f'{origin_typename}[{",".join(args)}]'

        # Common case: this is a regular module name like 'foo.bar.baz'
        return add_global(typename, o)

    # Run through reverse nodes and record the first instance of a use
    # of a given node. This represents the *last* use of the node in the
    # execution order of the program, which we will use to free unused
    # values
    node_to_last_use: Dict[Node, Node] = {}
    user_to_last_uses: Dict[Node, List[Node]] = {}

    def register_last_uses(n: Node, user: Node):
        if n not in node_to_last_use:
            node_to_last_use[n] = user
            user_to_last_uses.setdefault(user, []).append(n)

    for node in reversed(self.nodes):
        map_arg(node.args, lambda n: register_last_uses(n, node))
        map_arg(node.kwargs, lambda n: register_last_uses(n, node))

    # NOTE: we add a variable to distinguish body and ckpt_func
    def delete_unused_values(user: Node, body):
        """
        Delete values after their last use. This ensures that values that are
        not used in the remainder of the code are freed and the memory usage
        of the code is optimal.
        """
        if user.op == 'placeholder':
            return
        if user.op == 'output':
            body.append('\n')
            return
        nodes_to_delete = user_to_last_uses.get(user, [])
        if len(nodes_to_delete):
            to_delete_str = ' = '.join([repr(n) for n in nodes_to_delete] + ['None'])
            body.append(f'; {to_delete_str}\n')
        else:
            body.append('\n')

    # NOTE: we add a variable to distinguish body and ckpt_func
    def emit_node(node: Node, body):
        maybe_type_annotation = '' if node.type is None else f' : {type_repr(node.type)}'
        if node.op == 'placeholder':
            assert isinstance(node.target, str)
            maybe_default_arg = '' if not node.args else f' = {repr(node.args[0])}'
            free_vars.append(f'{node.target}{maybe_type_annotation}{maybe_default_arg}')
            raw_name = node.target.replace('*', '')
            if raw_name != repr(node):
                body.append(f'{repr(node)} = {raw_name}\n')
            return
        elif node.op == 'call_method':
            assert isinstance(node.target, str)
            body.append(f'{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.target)}'
                        f'({_format_args(node.args[1:], node.kwargs)})')
            return
        elif node.op == 'call_function':
            assert callable(node.target)
            # pretty print operators
            if node.target.__module__ == '_operator' and node.target.__name__ in magic_methods:
                assert isinstance(node.args, tuple)
                body.append(f'{repr(node)}{maybe_type_annotation} = '
                            f'{magic_methods[node.target.__name__].format(*(repr(a) for a in node.args))}')
                return
            qualified_name = _get_qualified_name(node.target)
            global_name = add_global(qualified_name, node.target)
            # special case for getattr: node.args could be 2-argument or 3-argument
            # 2-argument: attribute access; 3-argument: fall through to attrib function call with default value
            if global_name == 'getattr' and \
                    isinstance(node.args, tuple) and \
                    isinstance(node.args[1], str) and \
                    node.args[1].isidentifier() and \
                    len(node.args) == 2:
                body.append(
                    f'{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.args[1])}')
                return
            body.append(
                f'{repr(node)}{maybe_type_annotation} = {global_name}({_format_args(node.args, node.kwargs)})')
            if node.meta.get('is_wrapped', False):
                wrapped_fns.setdefault(global_name)
            return
        elif node.op == 'call_module':
            assert isinstance(node.target, str)
            body.append(f'{repr(node)}{maybe_type_annotation} = '
                        f'{_format_target(root_module, node.target)}({_format_args(node.args, node.kwargs)})')
            return
        elif node.op == 'get_attr':
            assert isinstance(node.target, str)
            body.append(f'{repr(node)}{maybe_type_annotation} = {_format_target(root_module, node.target)}')
            return
        elif node.op == 'output':
            if node.type is not None:
                maybe_return_annotation[0] = f" -> {type_repr(node.type)}"
            if self._pytree_info is None:
                body.append(f'return {repr(node.args[0])}')
            else:
                body.append(f'return pytree.tree_unflatten({repr(node.args[0])}, self._out_spec)')
            return
        raise NotImplementedError(f'node: {node.op} {node.target}')

    # Modified for activation checkpointing
    ckpt_func = []

    # if any node has a list of labels for activation_checkpoint, we
    # will use nested type of activation checkpoint codegen
    if any(isinstance(getattr(node, "activation_checkpoint", None), Iterable) for node in self.nodes):
        emit_code_with_nested_activation_checkpoint(body, ckpt_func, self.nodes, emit_node, delete_unused_values)
    else:
        emit_code_with_activation_checkpoint(body, ckpt_func, self.nodes, emit_node, delete_unused_values)

    if len(body) == 0:
        # If the Graph has no non-placeholder nodes, no lines for the body
        # have been emitted. To continue to have valid Python code, emit a
        # single pass statement
        body.append('pass\n')
    if self._pytree_info is not None:
        # pytree-flattened graphs receive their placeholders via tree_flatten_spec
        orig_args = self._pytree_info.orig_args
        has_orig_self = (orig_args[0] == 'self')
        if has_orig_self:
            free_vars.insert(0, 'self')
        if len(free_vars) > 0:    # pytree has placeholders in it
            body.insert(
                0,
                f"{', '.join(free_vars)}, = fx_pytree.tree_flatten_spec([{', '.join(orig_args)}], self._in_spec)\n")
    else:
        orig_args = free_vars

    if len(wrapped_fns) > 0:
        wrap_name = add_global('wrap', torch.fx.wrap)
        wrap_stmts = '\n'.join([f'{wrap_name}("{name}")' for name in wrapped_fns])
    else:
        wrap_stmts = ''

    ckpt_func = ''.join(ckpt_func)

    # If the original function didn't have self as its first argument, we
    # would have added it.
    if len(orig_args) == 0 or orig_args[0] != 'self':
        orig_args.insert(0, 'self')
    code = ''.join(body)
    code = '\n'.join('    ' + line for line in code.split('\n'))

    # as we need colossalai.utils.checkpoint, we need to import colossalai
    # in forward function
    fn_code = f"""
{wrap_stmts}

{ckpt_func}
def forward({', '.join(orig_args)}){maybe_return_annotation[0]}:
{code}"""
    return PythonCode(fn_code, globals_)
+ def __init__(self) -> None: + super().__init__() + self.linear0 = torch.nn.Linear(4, 4) + self.linear1 = torch.nn.Linear(4, 4) + self.linear2 = torch.nn.Linear(4, 4) + self.linear3 = torch.nn.Linear(4, 4) + self.linear4 = torch.nn.Linear(4, 4) + self.linear5 = torch.nn.Linear(4, 4) + self.linear6 = torch.nn.Linear(4, 4) + + def forward(self, x): + x = self.linear0(x) + x = self.linear1(x) + x = self.linear2(x) + x = self.linear3(x) + x = self.linear4(x) + x = self.linear5(x) + x = self.linear6(x) + return x + + +def _is_all_gradient_close(m: torch.nn.Module, gm: GraphModule) -> bool: + for m_p, gm_p in zip(m.parameters(), gm.parameters()): + if not torch.allclose(m_p.grad, gm_p.grad): + return False + return True + + +def _test_fwd_and_bwd(model: torch.nn.Module, gm: ColoGraphModule, data: torch.Tensor): + + # test forward + non_fx_out = model(data) + fx_out = gm(data) + assert torch.equal(non_fx_out, fx_out), "fx_out doesn't comply with original output" + + # test barckward + loss0 = non_fx_out.sum() + loss0.backward() + loss1 = fx_out.sum() + loss1.backward() + assert _is_all_gradient_close(model, gm), "gm doesn't have the same gradient as original one" + + +def _run_offload_codegen(rank): + # launch colossalai to make sure we could execute colossalai.utils.checkpoint currectly + colossalai.launch(config={}, rank=rank, world_size=1, host='localhost', port=free_port(), backend='nccl') + + # build model and input + model = MyNet().cuda() + data = torch.rand(4, 4).cuda() + + # trace the module and replace codegen + tracer = ColoTracer(trace_act_ckpt=True) + graph = tracer.trace(model) + codegen = ActivationCheckpointCodeGen() + graph.set_codegen(codegen) + + # annotate the activation offload part + # also annotate the activation_checkpoint so we could test both types + # of input offload + for node in graph.nodes: + if node.name == "linear0": + setattr(node, "activation_offload", [0, True, False]) + if node.name == "linear1": + setattr(node, "activation_offload", 
[0, True, False]) + if node.name == "linear2": + setattr(node, "activation_offload", [1, True, True]) + if node.name == "linear4": + setattr(node, "activation_offload", [2, False, True]) + if node.name == "linear5": + setattr(node, "activation_checkpoint", [0]) + setattr(node, "activation_offload", True) + + gm = ColoGraphModule(copy.deepcopy(model), graph) + gm.recompile() + + # assert we have all the components + code = graph.python_code("self").src + assert "def pack_hook_input(self, x):" in code and \ + "def unpack_hook(self, packed):" in code and \ + "def pack_hook_no_input(self, x):" in code and \ + "setattr(x, 'offload', True)" in code and \ + "setattr(linear3, 'offload', False)" in code and \ + "with torch.autograd.graph.saved_tensors_hooks(self.pack_hook_input, self.unpack_hook):" in code and \ + "with torch.autograd.graph.save_on_cpu(pin_memory=True):" in code and \ + "with torch.autograd.graph.saved_tensors_hooks(self.pack_hook_no_input, self.unpack_hook):" in code and \ + "colossalai.utils.activation_checkpoint.checkpoint(self.checkpoint_0, True, linear4, use_reentrant=False)" in code + + _test_fwd_and_bwd(model, gm, data) + gpc.destroy() + + +@pytest.mark.skipif(not with_codegen, reason='torch version is lower than 1.12.0') +def test_act_ckpt_codegen(): + mp.spawn(_run_offload_codegen, nprocs=1) + + +def _run_offload_codegen_torch11(rank): + # launch colossalai to make sure we could execute colossalai.utils.checkpoint currectly + colossalai.launch(config={}, rank=rank, world_size=1, host='localhost', port=free_port(), backend='nccl') + + # build model and input + model = MyNet().cuda() + data = torch.rand(4, 4).cuda() + + # trace the module and replace codegen + tracer = ColoTracer(trace_act_ckpt=True) + graph = tracer.trace(model) + + # replace a bound method of an object + graph._python_code = python_code_with_activation_checkpoint.__get__(graph) + + # annotate the activation offload part + # also annotate the activation_checkpoint so we could test 
both types + # of input offload + for node in graph.nodes: + if node.name == "linear0": + setattr(node, "activation_offload", [0, True, False]) + if node.name == "linear1": + setattr(node, "activation_offload", [0, True, False]) + if node.name == "linear2": + setattr(node, "activation_offload", [1, True, True]) + if node.name == "linear4": + setattr(node, "activation_offload", [2, False, True]) + if node.name == "linear5": + setattr(node, "activation_checkpoint", [0]) + setattr(node, "activation_offload", True) + + gm = ColoGraphModule(copy.deepcopy(model), graph) + gm.recompile() + + # assert we have all the components + code = graph.python_code("self").src + assert "def pack_hook_input(self, x):" in code and \ + "def unpack_hook(self, packed):" in code and \ + "def pack_hook_no_input(self, x):" in code and \ + "setattr(x, 'offload', True)" in code and \ + "setattr(linear3, 'offload', False)" in code and \ + "with torch.autograd.graph.saved_tensors_hooks(self.pack_hook_input, self.unpack_hook):" in code and \ + "with torch.autograd.graph.save_on_cpu(pin_memory=True):" in code and \ + "with torch.autograd.graph.saved_tensors_hooks(self.pack_hook_no_input, self.unpack_hook):" in code and \ + "colossalai.utils.activation_checkpoint.checkpoint(self.checkpoint_0, True, linear4, use_reentrant=False)" in code + + _test_fwd_and_bwd(model, gm, data) + gpc.destroy() + + +@pytest.mark.skip(reason="currently torch11 ColoGraphModule is not implemented") +def test_act_ckpt_python_code_torch11(): + mp.spawn(_run_offload_codegen_torch11, nprocs=1) + + +if __name__ == "__main__": + _run_offload_codegen(0) From 87cddf7e147f8db1c9710eb37961c489c09bd5b9 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Thu, 27 Oct 2022 16:40:19 +0800 Subject: [PATCH 002/209] rename and remove useless func --- chunk_codegen.py | 398 +++---------------------------------------- chunk_codegen_run.py | 69 +------- 2 files changed, 27 insertions(+), 440 deletions(-) diff --git a/chunk_codegen.py 
b/chunk_codegen.py index 684028c014de..09fda2b988eb 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -12,7 +12,7 @@ CODEGEN_AVAILABLE = False if CODEGEN_AVAILABLE: - __all__ = ['ActivationCheckpointCodeGen'] + __all__ = ['ChunkCodeGen'] else: __all__ = ['python_code_with_activation_checkpoint'] @@ -375,7 +375,7 @@ def emit_ckpt_func(body, body.append(usage) -def emit_code_with_nested_activation_checkpoint(body, ckpt_func, nodes, emit_node_func, delete_unused_value_func): +def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_value_func): """Emit code with nested activation checkpoint When we detect some of the node.activation_checkpoint is a List, we will use this function to emit the activation checkpoint codes. @@ -392,21 +392,21 @@ def emit_code_with_nested_activation_checkpoint(body, ckpt_func, nodes, emit_nod end_idx = [item[1] for item in ckpt_regions] # find the offload regions - offload_regions, offload_labels = _find_offload_regions(nodes) - offload_starts = [item[0] for item in offload_regions] - offload_ends = [item[1] for item in offload_regions] - offload_inputs = [] - offload_outputs = [] - within_offload_region = False + chunk_regions, chunk_labels = _find_offload_regions(nodes) + chunk_starts = [item[0] for item in chunk_regions] + chunk_ends = [item[1] for item in chunk_regions] + chunk_inputs = [] + chunk_outputs = [] + within_chunk_region = False node_list = list(nodes) # find the input and output var names for each offload region - for idx, (start, end) in enumerate(offload_regions): + for idx, (start, end) in enumerate(chunk_regions): offload_node_list = node_list[start:end + 1] inputs, outputs = _find_input_and_output_nodes(offload_node_list) - offload_inputs.append(inputs) - offload_outputs.append(outputs) + chunk_inputs.append(inputs) + chunk_outputs.append(outputs) # this flag is to prevent repeated insert of save tensors # hooks definition in ckpt_func @@ -427,10 +427,10 @@ def 
emit_code_with_nested_activation_checkpoint(body, ckpt_func, nodes, emit_nod else: node = node_list[node_idx] - if node_idx in offload_starts: - offload_label = offload_labels[offload_starts.index(node_idx)] - _, offload_input, offload_bar = offload_label - within_offload_region = True + if node_idx in chunk_starts: + chunk_label = chunk_labels[chunk_starts.index(node_idx)] + _, chunk_input, chunk_bar = chunk_label + within_chunk_region = True # insert hook functions if needed if not is_hook_inserted: @@ -438,20 +438,20 @@ def emit_code_with_nested_activation_checkpoint(body, ckpt_func, nodes, emit_nod ckpt_func.insert(0, "\n".join([pack_hook, unpack_hook]) + "\n") is_hook_inserted = True - if offload_input and offload_bar: + if chunk_input and chunk_bar: body.append(_gen_save_on_cpu_context()) - elif offload_input: - for par in offload_inputs[offload_label[0]]: + elif chunk_input: + for par in chunk_inputs[chunk_label[0]]: body.append(f"setattr({par}, 'offload', True)\n") body.append(_gen_save_tensors_hooks_context(offload_input=True)) else: - for par in offload_inputs[offload_label[0]]: + for par in chunk_inputs[chunk_label[0]]: body.append(f"setattr({par}, 'offload', False)\n") body.append(_gen_save_tensors_hooks_context(offload_input=False)) - if within_offload_region: + if within_chunk_region: emit_node_func(node, body) body[-1] = ' ' + body[-1] delete_unused_value_func(node, body) @@ -460,150 +460,15 @@ def emit_code_with_nested_activation_checkpoint(body, ckpt_func, nodes, emit_nod emit_node_func(node, body) delete_unused_value_func(node, body) - if node_idx in offload_ends: - within_offload_region = False + if node_idx in chunk_ends: + within_chunk_region = False node_idx += 1 -def emit_code_with_activation_checkpoint(body, ckpt_func, nodes, emit_node_func, delete_unused_value_func): - # find the activation checkpoint regions - ckpt_regions = _find_ckpt_regions(nodes) - start_idx = [item[0] for item in ckpt_regions] - end_idx = [item[1] for item in 
ckpt_regions] - input_vars = [] - output_vars = [] - within_ckpt_region = False - - # find the offload regions - offload_regions, offload_labels = _find_offload_regions(nodes) - offload_starts = [item[0] for item in offload_regions] - offload_ends = [item[1] for item in offload_regions] - offload_inputs = [] - offload_outputs = [] - within_offload_region = False - - node_list = list(nodes) - - # use this variable to avoid inserting hook functions - # to ckpt_func repeatedly - is_hook_inserted = False - - # find the input and output var names for each region - for idx, (start, end) in enumerate(ckpt_regions): - ckpt_node_list = node_list[start:end + 1] - inputs, outputs = _find_input_and_output_nodes(ckpt_node_list) - input_vars.append(inputs) - output_vars.append(outputs) - - # find the input and output var names for each offload region - for idx, (start, end) in enumerate(offload_regions): - offload_node_list = node_list[start:end + 1] - inputs, outputs = _find_input_and_output_nodes(offload_node_list) - offload_inputs.append(inputs) - offload_outputs.append(outputs) - - # append code text to body - for idx, node in enumerate(node_list): - # if this is the first node of the ckpt region - # append the ckpt function defition - if idx in start_idx: - label = start_idx.index(idx) - ckpt_fn_def = _gen_ckpt_fn_def(label, input_vars[label]) - ckpt_func.append(f'{ckpt_fn_def}\n') - within_ckpt_region = True - - if idx in offload_starts: - offload_label = offload_labels[offload_starts.index(idx)] - _, offload_input, offload_bar = offload_label - within_offload_region = True - - # insert hook functions if needed - if not is_hook_inserted: - pack_hook, unpack_hook = _gen_saved_tensors_hooks() - ckpt_func.insert(0, "\n".join([pack_hook, unpack_hook]) + "\n") - is_hook_inserted = True - - if offload_input and offload_bar: - body.append(_gen_save_on_cpu_context()) - - elif offload_input: - for par in offload_inputs[offload_label[0]]: - body.append(f"setattr({par}, 'offload', 
True)\n") - body.append(_gen_save_tensors_hooks_context(offload_input=True)) - - else: - for par in offload_inputs[offload_label[0]]: - body.append(f"setattr({par}, 'offload', False)\n") - body.append(_gen_save_tensors_hooks_context(offload_input=False)) - - # NOTE: emit_node does not emit a string with newline. It depends - # on delete_unused_values to append one - # NOTE: currently we separate body and ckpt_func definition - if within_ckpt_region: - emit_node_func(node, ckpt_func) - ckpt_func[-1] = ' ' + ckpt_func[-1] - delete_unused_value_func(node, ckpt_func) - - elif within_offload_region: - emit_node_func(node, body) - body[-1] = ' ' + body[-1] - delete_unused_value_func(node, body) - - else: - emit_node_func(node, body) - delete_unused_value_func(node, body) - - if idx in end_idx: - # if this is the last node of the ckpt region - # generate return statement - label = end_idx.index(idx) - return_statement = _gen_ckpt_output(output_vars[label]) - return_statement = f' {return_statement}\n\n' - ckpt_func.append(return_statement) - - # we need to check if the checkpoint need to offload the input - start_node_idx = start_idx[label] - if hasattr(node_list[start_node_idx], 'activation_offload'): - activation_offload = node_list[start_node_idx].activation_offload - else: - activation_offload = False - - # we need to check if the checkpoint need use_reentrant=False - use_reentrant = True - non_leaf_input = 0 - for var in input_vars[label]: - input_node = next(item for item in node_list if item.name == var) - if input_node.op != "placeholder": - non_leaf_input = 1 - for user in input_node.users: - if hasattr(user, "activation_checkpoint"): - if user.activation_checkpoint == label: - if user.op == "call_module": - if hasattr(user.graph.owning_module.get_submodule(user.target), "inplace"): - use_reentrant = not user.graph.owning_module.get_submodule(user.target).inplace - - elif user.op == "call_function": - if "inplace" in user.kwargs: - use_reentrant = not 
user.kwargs["inplace"] - - # if all the inputs are leaf nodes, we need to set use_reentrant = False - if not non_leaf_input: - use_reentrant = False - - # generate checkpoint function call in a new line - usage = _gen_ckpt_usage(label, activation_offload, input_vars[label], output_vars[label], use_reentrant) - usage += '\n' - body.append(usage) - within_ckpt_region = False - - if idx in offload_ends: - within_offload_region = False - - if CODEGEN_AVAILABLE: - class ActivationCheckpointCodeGen(CodeGen): + class ChunkCodeGen(CodeGen): def _gen_python_code(self, nodes, root_module: str, namespace: _Namespace) -> PythonCode: free_vars: List[str] = [] @@ -796,10 +661,7 @@ def emit_node(node: Node, body): # if any node has a list of labels for activation_checkpoint, we # will use nested type of activation checkpoint codegen - if any(isinstance(getattr(node, "activation_checkpoint", None), Iterable) for node in nodes): - emit_code_with_nested_activation_checkpoint(body, ckpt_func, nodes, emit_node, delete_unused_values) - else: - emit_code_with_activation_checkpoint(body, ckpt_func, nodes, emit_node, delete_unused_values) + emit_code_with_chunk(body, ckpt_func, nodes, emit_node, delete_unused_values) if len(body) == 0: # If the Graph has no non-placeholder nodes, no lines for the body @@ -833,215 +695,3 @@ def emit_node(node: Node, body): {prologue} {code}""" return PythonCode(fn_code, globals_) - -else: - - def python_code_with_activation_checkpoint(self, root_module: str, namespace: _Namespace) -> PythonCode: - """ - This method is copied from the _python_code of torch.fx.graph.Graph. Modifications are made so that it can generate - code for activation checkpoint. - """ - free_vars: List[str] = [] - body: List[str] = [] - globals_: Dict[str, Any] = {} - wrapped_fns: Dict[str, None] = {} - - # Wrap string in list to pass by reference - maybe_return_annotation: List[str] = [''] - - def add_global(name_hint: str, obj: Any): - """Add an obj to be tracked as a global. 
- - We call this for names that reference objects external to the - Graph, like functions or types. - - Returns: the global name that should be used to reference 'obj' in generated source. - """ - if _is_from_torch(obj) and obj != torch.device: # to support registering torch.device - # HACK: workaround for how torch custom ops are registered. We - # can't import them like normal modules so they must retain their - # fully qualified name. - return _get_qualified_name(obj) - - # normalize the name hint to get a proper identifier - global_name = namespace.create_name(name_hint, obj) - - if global_name in globals_: - assert globals_[global_name] is obj - return global_name - globals_[global_name] = obj - return global_name - - # set _custom_builtins here so that we needn't import colossalai in forward - _custom_builtins["colossalai"] = _CustomBuiltin("import colossalai", colossalai) - - # Pre-fill the globals table with registered builtins. - for name, (_, obj) in _custom_builtins.items(): - add_global(name, obj) - - def type_repr(o: Any): - if o == (): - # Empty tuple is used for empty tuple type annotation Tuple[()] - return '()' - - typename = _type_repr(o) - - # This is a generic type, e.g. typing.List[torch.Tensor] - if hasattr(o, '__origin__'): - origin_type = _origin_type_map.get(o.__origin__, o.__origin__) - origin_typename = add_global(_type_repr(origin_type), origin_type) - - # Assign global names for each of the inner type variables. - args = [type_repr(arg) for arg in o.__args__] - - return f'{origin_typename}[{",".join(args)}]' - - # Common case: this is a regular module name like 'foo.bar.baz' - return add_global(typename, o) - - # Run through reverse nodes and record the first instance of a use - # of a given node. 
This represents the *last* use of the node in the - # execution order of the program, which we will use to free unused - # values - node_to_last_use: Dict[Node, Node] = {} - user_to_last_uses: Dict[Node, List[Node]] = {} - - def register_last_uses(n: Node, user: Node): - if n not in node_to_last_use: - node_to_last_use[n] = user - user_to_last_uses.setdefault(user, []).append(n) - - for node in reversed(self.nodes): - map_arg(node.args, lambda n: register_last_uses(n, node)) - map_arg(node.kwargs, lambda n: register_last_uses(n, node)) - - # NOTE: we add a variable to distinguish body and ckpt_func - def delete_unused_values(user: Node, body): - """ - Delete values after their last use. This ensures that values that are - not used in the remainder of the code are freed and the memory usage - of the code is optimal. - """ - if user.op == 'placeholder': - return - if user.op == 'output': - body.append('\n') - return - nodes_to_delete = user_to_last_uses.get(user, []) - if len(nodes_to_delete): - to_delete_str = ' = '.join([repr(n) for n in nodes_to_delete] + ['None']) - body.append(f'; {to_delete_str}\n') - else: - body.append('\n') - - # NOTE: we add a variable to distinguish body and ckpt_func - def emit_node(node: Node, body): - maybe_type_annotation = '' if node.type is None else f' : {type_repr(node.type)}' - if node.op == 'placeholder': - assert isinstance(node.target, str) - maybe_default_arg = '' if not node.args else f' = {repr(node.args[0])}' - free_vars.append(f'{node.target}{maybe_type_annotation}{maybe_default_arg}') - raw_name = node.target.replace('*', '') - if raw_name != repr(node): - body.append(f'{repr(node)} = {raw_name}\n') - return - elif node.op == 'call_method': - assert isinstance(node.target, str) - body.append(f'{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.target)}' - f'({_format_args(node.args[1:], node.kwargs)})') - return - elif node.op == 'call_function': - assert callable(node.target) - # pretty print 
operators - if node.target.__module__ == '_operator' and node.target.__name__ in magic_methods: - assert isinstance(node.args, tuple) - body.append(f'{repr(node)}{maybe_type_annotation} = ' - f'{magic_methods[node.target.__name__].format(*(repr(a) for a in node.args))}') - return - qualified_name = _get_qualified_name(node.target) - global_name = add_global(qualified_name, node.target) - # special case for getattr: node.args could be 2-argument or 3-argument - # 2-argument: attribute access; 3-argument: fall through to attrib function call with default value - if global_name == 'getattr' and \ - isinstance(node.args, tuple) and \ - isinstance(node.args[1], str) and \ - node.args[1].isidentifier() and \ - len(node.args) == 2: - body.append( - f'{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.args[1])}') - return - body.append( - f'{repr(node)}{maybe_type_annotation} = {global_name}({_format_args(node.args, node.kwargs)})') - if node.meta.get('is_wrapped', False): - wrapped_fns.setdefault(global_name) - return - elif node.op == 'call_module': - assert isinstance(node.target, str) - body.append(f'{repr(node)}{maybe_type_annotation} = ' - f'{_format_target(root_module, node.target)}({_format_args(node.args, node.kwargs)})') - return - elif node.op == 'get_attr': - assert isinstance(node.target, str) - body.append(f'{repr(node)}{maybe_type_annotation} = {_format_target(root_module, node.target)}') - return - elif node.op == 'output': - if node.type is not None: - maybe_return_annotation[0] = f" -> {type_repr(node.type)}" - if self._pytree_info is None: - body.append(f'return {repr(node.args[0])}') - else: - body.append(f'return pytree.tree_unflatten({repr(node.args[0])}, self._out_spec)') - return - raise NotImplementedError(f'node: {node.op} {node.target}') - - # Modified for activation checkpointing - ckpt_func = [] - - # if any node has a list of labels for activation_checkpoint, we - # will use nested type of activation checkpoint 
codegen - if any(isinstance(getattr(node, "activation_checkpoint", None), Iterable) for node in self.nodes): - emit_code_with_nested_activation_checkpoint(body, ckpt_func, self.nodes, emit_node, delete_unused_values) - else: - emit_code_with_activation_checkpoint(body, ckpt_func, self.nodes, emit_node, delete_unused_values) - - if len(body) == 0: - # If the Graph has no non-placeholder nodes, no lines for the body - # have been emitted. To continue to have valid Python code, emit a - # single pass statement - body.append('pass\n') - if self._pytree_info is not None: - orig_args = self._pytree_info.orig_args - has_orig_self = (orig_args[0] == 'self') - if has_orig_self: - free_vars.insert(0, 'self') - if len(free_vars) > 0: # pytree has placeholders in it - body.insert( - 0, - f"{', '.join(free_vars)}, = fx_pytree.tree_flatten_spec([{', '.join(orig_args)}], self._in_spec)\n") - else: - orig_args = free_vars - - if len(wrapped_fns) > 0: - wrap_name = add_global('wrap', torch.fx.wrap) - wrap_stmts = '\n'.join([f'{wrap_name}("{name}")' for name in wrapped_fns]) - else: - wrap_stmts = '' - - ckpt_func = ''.join(ckpt_func) - - # If the original function didn't have self as its first argument, we - # would have added it. 
- if len(orig_args) == 0 or orig_args[0] != 'self': - orig_args.insert(0, 'self') - code = ''.join(body) - code = '\n'.join(' ' + line for line in code.split('\n')) - - # as we need colossalai.utils.checkpoint, we need to import colossalai - # in forward function - fn_code = f""" -{wrap_stmts} - -{ckpt_func} -def forward({', '.join(orig_args)}){maybe_return_annotation[0]}: -{code}""" - return PythonCode(fn_code, globals_) diff --git a/chunk_codegen_run.py b/chunk_codegen_run.py index 9ac399a29b51..85164bdada96 100644 --- a/chunk_codegen_run.py +++ b/chunk_codegen_run.py @@ -11,7 +11,7 @@ from colossalai.fx.graph_module import ColoGraphModule try: - from chunk_codegen import ActivationCheckpointCodeGen + from chunk_codegen import ChunkCodeGen with_codegen = True except: # fall back to older pytorch version @@ -75,7 +75,7 @@ def _run_offload_codegen(rank): # trace the module and replace codegen tracer = ColoTracer(trace_act_ckpt=True) graph = tracer.trace(model) - codegen = ActivationCheckpointCodeGen() + codegen = ChunkCodeGen() graph.set_codegen(codegen) # annotate the activation offload part @@ -99,15 +99,7 @@ def _run_offload_codegen(rank): # assert we have all the components code = graph.python_code("self").src - assert "def pack_hook_input(self, x):" in code and \ - "def unpack_hook(self, packed):" in code and \ - "def pack_hook_no_input(self, x):" in code and \ - "setattr(x, 'offload', True)" in code and \ - "setattr(linear3, 'offload', False)" in code and \ - "with torch.autograd.graph.saved_tensors_hooks(self.pack_hook_input, self.unpack_hook):" in code and \ - "with torch.autograd.graph.save_on_cpu(pin_memory=True):" in code and \ - "with torch.autograd.graph.saved_tensors_hooks(self.pack_hook_no_input, self.unpack_hook):" in code and \ - "colossalai.utils.activation_checkpoint.checkpoint(self.checkpoint_0, True, linear4, use_reentrant=False)" in code + print(code) _test_fwd_and_bwd(model, gm, data) gpc.destroy() @@ -118,60 +110,5 @@ def 
test_act_ckpt_codegen(): mp.spawn(_run_offload_codegen, nprocs=1) -def _run_offload_codegen_torch11(rank): - # launch colossalai to make sure we could execute colossalai.utils.checkpoint currectly - colossalai.launch(config={}, rank=rank, world_size=1, host='localhost', port=free_port(), backend='nccl') - - # build model and input - model = MyNet().cuda() - data = torch.rand(4, 4).cuda() - - # trace the module and replace codegen - tracer = ColoTracer(trace_act_ckpt=True) - graph = tracer.trace(model) - - # replace a bound method of an object - graph._python_code = python_code_with_activation_checkpoint.__get__(graph) - - # annotate the activation offload part - # also annotate the activation_checkpoint so we could test both types - # of input offload - for node in graph.nodes: - if node.name == "linear0": - setattr(node, "activation_offload", [0, True, False]) - if node.name == "linear1": - setattr(node, "activation_offload", [0, True, False]) - if node.name == "linear2": - setattr(node, "activation_offload", [1, True, True]) - if node.name == "linear4": - setattr(node, "activation_offload", [2, False, True]) - if node.name == "linear5": - setattr(node, "activation_checkpoint", [0]) - setattr(node, "activation_offload", True) - - gm = ColoGraphModule(copy.deepcopy(model), graph) - gm.recompile() - - # assert we have all the components - code = graph.python_code("self").src - assert "def pack_hook_input(self, x):" in code and \ - "def unpack_hook(self, packed):" in code and \ - "def pack_hook_no_input(self, x):" in code and \ - "setattr(x, 'offload', True)" in code and \ - "setattr(linear3, 'offload', False)" in code and \ - "with torch.autograd.graph.saved_tensors_hooks(self.pack_hook_input, self.unpack_hook):" in code and \ - "with torch.autograd.graph.save_on_cpu(pin_memory=True):" in code and \ - "with torch.autograd.graph.saved_tensors_hooks(self.pack_hook_no_input, self.unpack_hook):" in code and \ - 
"colossalai.utils.activation_checkpoint.checkpoint(self.checkpoint_0, True, linear4, use_reentrant=False)" in code - - _test_fwd_and_bwd(model, gm, data) - gpc.destroy() - - -@pytest.mark.skip(reason="currently torch11 ColoGraphModule is not implemented") -def test_act_ckpt_python_code_torch11(): - mp.spawn(_run_offload_codegen_torch11, nprocs=1) - - if __name__ == "__main__": _run_offload_codegen(0) From 78cfe4362b4550635f609a8b52a8489c7f9aa564 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Wed, 2 Nov 2022 13:59:48 +0800 Subject: [PATCH 003/209] basic chunk --- chunk_codegen.py | 66 ++++++++++++++++++++++---------------------- chunk_codegen_run.py | 15 +++++----- 2 files changed, 41 insertions(+), 40 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 09fda2b988eb..c605e35f4725 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -46,6 +46,19 @@ def pack_hook_no_input(self, x): return pack_hook, unpack_hook +def _gen_loop_5(to_keep): + context = "chunk_result = []\nfor gen_loop_idx in range(4):\n" + context += " chunk_tensor = " + to_keep + "[gen_loop_idx, :]\n" + return context + + +def _gen_loop_5_final(final_name, to_keep): + context = " chunk_result.append(" + final_name + ")\n" + context += "chunk_result = torch.cat(chunk_result, dim=0); " + to_keep[0] + " = None\n" + context += final_name + " = chunk_result; chunk_result = None\n" + return context + + def _gen_save_tensors_hooks_context(offload_input=True) -> str: """Generate customized saved_tensors_hooks @@ -410,57 +423,40 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v # this flag is to prevent repeated insert of save tensors # hooks definition in ckpt_func - is_hook_inserted = False node_idx = 0 - while 1: + to_keep = [] + while node_idx < len(node_list): # break if we finish the processing all the nodes if node_idx >= len(node_list): break - # process ckpt_regions - if node_idx in start_idx: - ckpt_node_list = 
node_list[node_idx:end_idx[start_idx.index(node_idx)] + 1] - emit_ckpt_func(body, ckpt_func, ckpt_node_list, emit_node_func, delete_unused_value_func) - node_idx += len(ckpt_node_list) - # process node in forward function else: node = node_list[node_idx] if node_idx in chunk_starts: - chunk_label = chunk_labels[chunk_starts.index(node_idx)] - _, chunk_input, chunk_bar = chunk_label + # save chunk input var, dont delete it + to_keep.extend(node.args[0].name) within_chunk_region = True - - # insert hook functions if needed - if not is_hook_inserted: - pack_hook, unpack_hook = _gen_saved_tensors_hooks() - ckpt_func.insert(0, "\n".join([pack_hook, unpack_hook]) + "\n") - is_hook_inserted = True - - if chunk_input and chunk_bar: - body.append(_gen_save_on_cpu_context()) - - elif chunk_input: - for par in chunk_inputs[chunk_label[0]]: - body.append(f"setattr({par}, 'offload', True)\n") - body.append(_gen_save_tensors_hooks_context(offload_input=True)) - - else: - for par in chunk_inputs[chunk_label[0]]: - body.append(f"setattr({par}, 'offload', False)\n") - body.append(_gen_save_tensors_hooks_context(offload_input=False)) + # add for loop + body.append(_gen_loop_5(to_keep[0])) + # change first node's input to new chunked var + node_args = list(node.args) + node_args[0] = 'chunk_tensor' if within_chunk_region: emit_node_func(node, body) body[-1] = ' ' + body[-1] - delete_unused_value_func(node, body) + delete_unused_value_func(node, body, to_keep) else: emit_node_func(node, body) - delete_unused_value_func(node, body) + if node_idx not in chunk_inputs: + delete_unused_value_func(node, body, to_keep) if node_idx in chunk_ends: + body.append(_gen_loop_5_final(node.name, to_keep)) + to_keep = [] within_chunk_region = False node_idx += 1 @@ -572,7 +568,7 @@ def register_last_uses(n: Node, user: Node): map_arg(node.kwargs, lambda n: register_last_uses(n, node)) # NOTE: we add a variable to distinguish body and ckpt_func - def delete_unused_values(user: Node, body): + def 
delete_unused_values(user: Node, body, to_keep=[]): """ Delete values after their last use. This ensures that values that are not used in the remainder of the code are freed and the memory usage @@ -584,6 +580,9 @@ def delete_unused_values(user: Node, body): body.append('\n') return nodes_to_delete = user_to_last_uses.get(user, []) + for n in nodes_to_delete: + if n.name in to_keep: + nodes_to_delete.remove(n) if len(nodes_to_delete): to_delete_str = ' = '.join([repr(n) for n in nodes_to_delete] + ['None']) body.append(f'; {to_delete_str}\n') @@ -693,5 +692,6 @@ def emit_node(node: Node, body): {wrap_stmts} {prologue} -{code}""" +{code}""" + print(fn_code) return PythonCode(fn_code, globals_) diff --git a/chunk_codegen_run.py b/chunk_codegen_run.py index 85164bdada96..69b327d4bd5b 100644 --- a/chunk_codegen_run.py +++ b/chunk_codegen_run.py @@ -54,6 +54,7 @@ def _test_fwd_and_bwd(model: torch.nn.Module, gm: ColoGraphModule, data: torch.T # test forward non_fx_out = model(data) fx_out = gm(data) + print(non_fx_out.shape, fx_out.shape) assert torch.equal(non_fx_out, fx_out), "fx_out doesn't comply with original output" # test barckward @@ -86,13 +87,13 @@ def _run_offload_codegen(rank): setattr(node, "activation_offload", [0, True, False]) if node.name == "linear1": setattr(node, "activation_offload", [0, True, False]) - if node.name == "linear2": - setattr(node, "activation_offload", [1, True, True]) - if node.name == "linear4": - setattr(node, "activation_offload", [2, False, True]) - if node.name == "linear5": - setattr(node, "activation_checkpoint", [0]) - setattr(node, "activation_offload", True) + # if node.name == "linear2": + # setattr(node, "activation_offload", [1, True, True]) + # if node.name == "linear4": + # setattr(node, "activation_offload", [2, False, True]) + # if node.name == "linear5": + # setattr(node, "activation_checkpoint", [0]) + # setattr(node, "activation_offload", True) gm = ColoGraphModule(copy.deepcopy(model), graph) gm.recompile() From 
86f2a3147415f2afe53019cd7b9d9414de1510e9 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Wed, 2 Nov 2022 15:12:08 +0800 Subject: [PATCH 004/209] add evoformer --- evoformer/evoformer.py | 47 ++++++++++ evoformer/initializer.py | 29 ++++++ evoformer/kernel.py | 19 ++++ evoformer/msa.py | 95 +++++++++++++++++++ evoformer/ops.py | 176 +++++++++++++++++++++++++++++++++++ evoformer/triangle.py | 192 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 558 insertions(+) create mode 100644 evoformer/evoformer.py create mode 100755 evoformer/initializer.py create mode 100644 evoformer/kernel.py create mode 100644 evoformer/msa.py create mode 100755 evoformer/ops.py create mode 100644 evoformer/triangle.py diff --git a/evoformer/evoformer.py b/evoformer/evoformer.py new file mode 100644 index 000000000000..ef3df2769840 --- /dev/null +++ b/evoformer/evoformer.py @@ -0,0 +1,47 @@ +import torch +import torch.nn as nn + +from .msa import MSAStack +from .ops import OutProductMean +from .triangle import PairStack + + +class EvoformerBlock(nn.Module): + + def __init__(self, d_node, d_pair): + super(EvoformerBlock, self).__init__() + + self.msa_stack = MSAStack(d_node, d_pair, p_drop=0.15) + self.communication = OutProductMean(n_feat=d_node, n_feat_out=d_pair, n_feat_proj=32) + self.pair_stack = PairStack(d_pair=d_pair) + + def forward(self, node, pair): + node = node + self.msa_stack(node, pair) + pair = pair + self.communication(node) + pair = pair + self.pair_stack(pair) + return node, pair + + +class Evoformer(nn.Module): + + def __init__(self, d_node, d_pair): + super(Evoformer, self).__init__() + + self.blocks = nn.ModuleList() + for _ in range(3): + self.blocks.append(EvoformerBlock(d_node, d_pair)) + + def forward(self, node, pair): + for b in self.blocks: + node, pair = b(node, pair) + return node, pair + +def evoformer_base(): + return Evoformer(d_node=256, d_pair=128) + + +def evoformer_large(): + return Evoformer(d_node=512, d_pair=256) + + +__all__ = ['Evoformer', 
'evoformer_base', 'evoformer_large'] diff --git a/evoformer/initializer.py b/evoformer/initializer.py new file mode 100755 index 000000000000..c6ce0659e597 --- /dev/null +++ b/evoformer/initializer.py @@ -0,0 +1,29 @@ +import math + +import numpy as np +import torch.nn as nn + + +def glorot_uniform_af(x, gain=1.0): + """ + initialize tensors the same as xavier_initializer in PyTorch, but the dimensions are different: + In PyTorch: + [feature_out, feature_in, n_head ...] + In Jax: + [... n_head, feature_in, feature_out] + However, there is a feature in original Alphafold2 code that they use the Jax version initializer to initialize tensors like: + [feature_in, n_head, feature_out] + + In this function, we keep this feature to initialize [feature_in, n_head, ..., feature_out] tensors + """ + fan_in, fan_out = x.shape[-2:] + if len(x.shape) > 2: + receptive_field_size = np.prod(x.shape[:-2]) + fan_in *= receptive_field_size + fan_out *= receptive_field_size + std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) + dev = math.sqrt(3.0) * std # Calculate uniform bounds from standard deviation + + nn.init.uniform_(x, -dev, dev) + + return x diff --git a/evoformer/kernel.py b/evoformer/kernel.py new file mode 100644 index 000000000000..2655901a2fe9 --- /dev/null +++ b/evoformer/kernel.py @@ -0,0 +1,19 @@ +import torch +import torch.nn.functional as F + + +def bias_sigmod_ele(y, bias, z): + return torch.sigmoid(y + bias) * z + + +def bias_dropout_add(x: torch.Tensor, bias: torch.Tensor, dropmask: torch.Tensor, + residual: torch.Tensor, prob: float) -> torch.Tensor: + out = (x + bias) * F.dropout(dropmask, p=prob, training=True) + out = residual + out + return out + + +def bias_ele_dropout_residual(ab: torch.Tensor, b: torch.Tensor, g: torch.Tensor, + dropout_mask: torch.Tensor, Z_raw: torch.Tensor, + prob: float) -> torch.Tensor: + return Z_raw + F.dropout(dropout_mask, p=prob, training=True) * (g * (ab + b)) \ No newline at end of file diff --git a/evoformer/msa.py 
b/evoformer/msa.py new file mode 100644 index 000000000000..ccefa38c48be --- /dev/null +++ b/evoformer/msa.py @@ -0,0 +1,95 @@ +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange +from torch.nn import LayerNorm + +from .kernel import bias_dropout_add +from .ops import SelfAttention, Transition + + +class MSARowAttentionWithPairBias(nn.Module): + + def __init__(self, d_node, d_pair, c=32, n_head=8, p_drop=0.15): + super(MSARowAttentionWithPairBias, self).__init__() + self.d_node = d_node + self.d_pair = d_pair + self.c = c + self.n_head = n_head + self.p_drop = p_drop + + self.layernormM = LayerNorm(d_node) + self.layernormZ = LayerNorm(d_pair) + + _init_weights = torch.nn.init.normal_(torch.zeros([n_head, d_pair]), + std=1.0 / math.sqrt(d_pair)) + self.linear_b_weights = nn.parameter.Parameter(data=_init_weights, requires_grad=True) + + self.attention = SelfAttention(qkv_dim=d_node, + c=c, + n_head=n_head, + out_dim=d_node, + gating=True, + last_bias_fuse=True) + + self.out_bias = nn.parameter.Parameter(data=torch.zeros((d_node,)), requires_grad=True) + + def forward(self, M_raw, Z): + ## Input projections + M = self.layernormM(M_raw) + Z = self.layernormZ(Z) + b = F.linear(Z, self.linear_b_weights) + b = b.permute(0, 3, 1, 2) + # b = rearrange(b, 'b q k h -> b h q k') + + M = self.attention(M, b) + dropout_mask = torch.ones_like(M[:, 0:1, :, :], device=M.device, dtype=M.dtype) + + return bias_dropout_add(M, self.out_bias, dropout_mask, M_raw, prob=self.p_drop) + + +class MSAColumnAttention(nn.Module): + + def __init__(self, d_node, c=32, n_head=8): + super(MSAColumnAttention, self).__init__() + self.d_node = d_node + self.c = c + self.n_head = n_head + + self.layernormM = LayerNorm(d_node) + self.attention = SelfAttention(qkv_dim=d_node, + c=c, + n_head=n_head, + out_dim=d_node, + gating=True) + + def forward(self, M_raw): + M = M_raw.transpose(-2, -3) + M = self.layernormM(M) + + M = self.attention(M) 
+ + M = M.transpose(-2, -3) + return M_raw + M + + +class MSAStack(nn.Module): + + def __init__(self, d_node, d_pair, p_drop=0.15): + super(MSAStack, self).__init__() + + self.MSARowAttentionWithPairBias = MSARowAttentionWithPairBias(d_node=d_node, + d_pair=d_pair, + p_drop=p_drop) + + self.MSAColumnAttention = MSAColumnAttention(d_node=d_node) + self.MSATransition = Transition(d=d_node) + + def forward(self, node, pair): + node = self.MSARowAttentionWithPairBias(node, pair) + node = self.MSAColumnAttention(node) + node = self.MSATransition(node) + + return node diff --git a/evoformer/ops.py b/evoformer/ops.py new file mode 100755 index 000000000000..ddbba441dd5f --- /dev/null +++ b/evoformer/ops.py @@ -0,0 +1,176 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange +from torch.nn import LayerNorm + +from .initializer import glorot_uniform_af +from .kernel import bias_sigmod_ele + + +class DropoutRowwise(nn.Module): + + def __init__(self, p): + super(DropoutRowwise, self).__init__() + self.p = p + self.dropout = nn.Dropout(p=p) + + def forward(self, x): + dropout_mask = torch.ones_like(x[:, 0:1, :, :]) + dropout_mask = self.dropout(dropout_mask) + return dropout_mask * x + + +class DropoutColumnwise(nn.Module): + + def __init__(self, p): + super(DropoutColumnwise, self).__init__() + self.p = p + self.dropout = nn.Dropout(p=p) + + def forward(self, x): + dropout_mask = torch.ones_like(x[:, :, 0:1, :]) + dropout_mask = self.dropout(dropout_mask) + return dropout_mask * x + + +class Transition(nn.Module): + + def __init__(self, d, n=4): + super(Transition, self).__init__() + self.norm = LayerNorm(d) + self.linear1 = Linear(d, n * d, initializer='relu') + self.linear2 = Linear(n * d, d, initializer='zeros') + + def forward(self, src): + x = self.norm(src) + x = self.linear2(F.relu(self.linear1(x))) + return src + x + + +class OutProductMean(nn.Module): + + def __init__(self, n_feat=64, n_feat_out=128, n_feat_proj=32): + 
super(OutProductMean, self).__init__() + + self.layernormM = LayerNorm(n_feat) + self.linear_a = Linear(n_feat, n_feat_proj) + self.linear_b = Linear(n_feat, n_feat_proj) + + self.o_linear = Linear(n_feat_proj * n_feat_proj, + n_feat_out, + initializer='zero', + use_bias=True) + + def forward(self, M): + M = self.layernormM(M) + left_act = self.linear_a(M) + right_act = self.linear_b(M) + + O = torch.einsum('bsid,bsje->bijde', left_act, right_act).contiguous() + # O = rearrange(O, 'b i j d e -> b i j (d e)') + O = O.reshape(O.shape[0], O.shape[1], O.shape[2], -1) + Z = self.o_linear(O) + + return Z + + +class Linear(nn.Linear): + """ + A Linear layer with built-in nonstandard initializations. Called just + like torch.nn.Linear. + Implements the initializers in 1.11.4, plus some additional ones found + in the code. + """ + + def __init__( + self, + feature_in: int, + feature_out: int, + initializer: str = 'linear', + use_bias: bool = True, + bias_init: float = 0., + ): + super(Linear, self).__init__(feature_in, feature_out, bias=use_bias) + + self.use_bias = use_bias + if initializer == 'linear': + glorot_uniform_af(self.weight, gain=1.0) + elif initializer == 'relu': + glorot_uniform_af(self.weight, gain=2.0) + elif initializer == 'zeros': + nn.init.zeros_(self.weight) + if self.use_bias: + with torch.no_grad(): + self.bias.fill_(bias_init) + + +class SelfAttention(nn.Module): + """ + Multi-Head SelfAttention dealing with [batch_size1, batch_size2, len, dim] tensors + """ + + def __init__(self, qkv_dim, c, n_head, out_dim, gating=True, last_bias_fuse=False): + super(SelfAttention, self).__init__() + self.qkv_dim = qkv_dim + self.c = c + self.n_head = n_head + self.out_dim = out_dim + self.gating = gating + self.last_bias_fuse = last_bias_fuse + + self.scaling = self.c**(-0.5) + + # self.to_qkv = Linear(qkv_dim, 3 * n_head * c, initializer='linear') + self.to_q = Linear(qkv_dim, n_head * c, initializer='linear', use_bias=False) + self.to_k = Linear(qkv_dim, n_head * 
c, initializer='linear', use_bias=False) + self.to_v = Linear(qkv_dim, n_head * c, initializer='linear', use_bias=False) + + if gating: + self.gating_bias = nn.parameter.Parameter(data=torch.ones((n_head * c,))) + self.gating_linear = Linear(qkv_dim, n_head * c, initializer='zero', use_bias=False) + + self.o_linear = Linear(n_head * c, + out_dim, + initializer='zero', + use_bias=(not last_bias_fuse)) + + def forward(self, in_data, nonbatched_bias=None): + """ + :param in_data: [batch_size1, batch_size2, len_qkv, qkv_dim] + :param bias: None or [batch_size1, batch_size2, n_head, len_q, len_kv] + :param nonbatched_bias: None or [batch_size1, n_head, len_q, len_kv] + """ + + # qkv = self.to_qkv(in_data).chunk(3, dim=-1) + # q, k, v = map(lambda t: rearrange(t, 'b1 b2 n (h d) -> b1 b2 h n d', h=self.n_head), qkv) + + q = self.to_q(in_data) + k = self.to_k(in_data) + v = self.to_k(in_data) + + # q, k, v = map(lambda t: rearrange(t, 'b1 b2 n (h d) -> b1 b2 h n d', h=self.n_head), + # [q, k, v]) + q, k, v = map(lambda t: t.view(t.shape[0], t.shape[1], t.shape[2], self.n_head, -1).permute(0, 1, 3, 2, 4), + [q, k, v]) + + q = q * self.scaling + + logits = torch.matmul(q, k.transpose(-1, -2)) + + if nonbatched_bias is not None: + logits += nonbatched_bias.unsqueeze(1) + weights = torch.softmax(logits, dim=-1) + # weights = softmax(logits) + + weighted_avg = torch.matmul(weights, v) + # weighted_avg = rearrange(weighted_avg, 'b1 b2 h n d -> b1 b2 n (h d)') + weighted_avg = weighted_avg.permute(0, 1, 3, 2, 4) + weighted_avg = weighted_avg.reshape(weighted_avg.shape[0], weighted_avg.shape[1], weighted_avg.shape[2], -1) + + if self.gating: + gate_values = self.gating_linear(in_data) + weighted_avg = bias_sigmod_ele(gate_values, self.gating_bias, weighted_avg) + + output = self.o_linear(weighted_avg) + return output diff --git a/evoformer/triangle.py b/evoformer/triangle.py new file mode 100644 index 000000000000..7db0482f5557 --- /dev/null +++ b/evoformer/triangle.py @@ -0,0 
+1,192 @@ +import math + +import torch +import torch.nn as nn +from torch.nn import LayerNorm + +from .kernel import bias_dropout_add, bias_ele_dropout_residual +from .ops import Linear, SelfAttention, Transition + + +def permute_final_dims(tensor, inds): + zero_index = -1 * len(inds) + first_inds = list(range(len(tensor.shape[:zero_index]))) + return tensor.permute(first_inds + [zero_index + i for i in inds]) + + +class TriangleMultiplicationOutgoing(nn.Module): + + def __init__(self, d_pair, p_drop, c=128): + super(TriangleMultiplicationOutgoing, self).__init__() + self.d_pair = d_pair + self.c = c + + self.layernorm1 = LayerNorm(d_pair) + self.left_projection = Linear(d_pair, c) + self.right_projection = Linear(d_pair, c) + self.left_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.) + self.right_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.) + + self.output_gate = Linear(d_pair, d_pair, initializer='zeros', bias_init=1.) + self.layernorm2 = LayerNorm(c) + self.output_projection = Linear(d_pair, d_pair, initializer='zeros', use_bias=False) + self.output_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True) + + self.p_drop = p_drop + + def forward(self, Z_raw): + Z = self.layernorm1(Z_raw) + left_proj_act = self.left_projection(Z) + right_proj_act = self.right_projection(Z) + + left_proj_act = left_proj_act * torch.sigmoid(self.left_gate(Z)) + right_proj_act = right_proj_act * torch.sigmoid(self.right_gate(Z)) + + g = torch.sigmoid(self.output_gate(Z)) + # p = torch.matmul( + # permute_final_dims(left_proj_act, (2, 0, 1)), + # permute_final_dims(right_proj_act, (2, 1, 0)), + # ) + # ab = permute_final_dims(p, (1, 2, 0)) + + ab = torch.einsum('bikd,bjkd->bijd', left_proj_act, right_proj_act) + ab = self.output_projection(self.layernorm2(ab)) + dropout_mask = torch.ones_like(Z[:, 0:1, :, :], device=Z.device, dtype=Z.dtype) + return bias_ele_dropout_residual(ab, + self.output_bias, + g, + dropout_mask, + Z_raw, + 
prob=self.p_drop) + + +class TriangleMultiplicationIncoming(nn.Module): + + def __init__(self, d_pair, p_drop, c=128): + super(TriangleMultiplicationIncoming, self).__init__() + self.d_pair = d_pair + self.c = c + + self.layernorm1 = LayerNorm(d_pair) + self.left_projection = Linear(d_pair, c) + self.right_projection = Linear(d_pair, c) + self.left_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.) + self.right_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.) + + self.output_gate = Linear(d_pair, d_pair, initializer='zeros', bias_init=1.) + self.layernorm2 = LayerNorm(c) + self.output_projection = Linear(d_pair, d_pair, initializer='zeros', use_bias=False) + self.output_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True) + + self.p_drop = p_drop + + def forward(self, Z_raw): + Z = self.layernorm1(Z_raw) + left_proj_act = self.left_projection(Z) + right_proj_act = self.right_projection(Z) + + left_proj_act = left_proj_act * torch.sigmoid(self.left_gate(Z)) + right_proj_act = right_proj_act * torch.sigmoid(self.right_gate(Z)) + + g = torch.sigmoid(self.output_gate(Z)) + # p = torch.matmul( + # permute_final_dims(left_proj_act, (2, 1, 0)), + # permute_final_dims(right_proj_act, (2, 0, 1)), + # ) + # ab = permute_final_dims(p, (1, 2, 0)) + + ab = torch.einsum('bkid,bkjd->bijd', left_proj_act, right_proj_act) + ab = self.output_projection(self.layernorm2(ab)) + dropout_mask = torch.ones_like(Z[:, 0:1, :, :], device=Z.device, dtype=Z.dtype) + return bias_ele_dropout_residual(ab, + self.output_bias, + g, + dropout_mask, + Z_raw, + prob=self.p_drop) + + +class TriangleAttentionStartingNode(nn.Module): + + def __init__(self, d_pair, p_drop, c=32, n_head=4): + super(TriangleAttentionStartingNode, self).__init__() + self.d_pair = d_pair + self.c = c + self.n_head = n_head + self.p_drop = p_drop + + self.layernorm1 = LayerNorm(d_pair) + _init_weights = torch.nn.init.normal_(torch.zeros([d_pair, n_head]), + std=1.0 / 
math.sqrt(d_pair)) + self.linear_b_weights = nn.parameter.Parameter(data=_init_weights) + self.attention = SelfAttention(qkv_dim=d_pair, + c=c, + n_head=n_head, + out_dim=d_pair, + gating=True, + last_bias_fuse=True) + + self.out_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True) + + def forward(self, Z_raw): + Z = self.layernorm1(Z_raw) + b = torch.einsum('bqkc,ch->bhqk', Z, self.linear_b_weights) + + Z = self.attention(Z, b) + + dropout_mask = torch.ones_like(Z[:, 0:1, :, :], device=Z.device, dtype=Z.dtype) + return bias_dropout_add(Z, self.out_bias, dropout_mask, Z_raw, prob=self.p_drop) + + +class TriangleAttentionEndingNode(nn.Module): + + def __init__(self, d_pair, p_drop, c=32, n_head=4): + super(TriangleAttentionEndingNode, self).__init__() + self.d_pair = d_pair + self.c = c + self.n_head = n_head + self.p_drop = p_drop + + self.layernorm1 = LayerNorm(d_pair) + _init_weights = torch.nn.init.normal_(torch.zeros([d_pair, n_head]), + std=1.0 / math.sqrt(d_pair)) + self.linear_b_weights = nn.parameter.Parameter(data=_init_weights) + self.attention = SelfAttention(qkv_dim=d_pair, + c=c, + n_head=n_head, + out_dim=d_pair, + gating=True, + last_bias_fuse=True) + + self.out_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True) + + def forward(self, Z_raw): + Z = Z_raw.transpose(-2, -3) + Z = self.layernorm1(Z) + b = torch.einsum('bqkc,ch->bhqk', Z, self.linear_b_weights) + + Z = self.attention(Z, b) + + Z = Z.transpose(-2, -3) + dropout_mask = torch.ones_like(Z[:, :, 0:1, :], device=Z.device, dtype=Z.dtype) + return bias_dropout_add(Z, self.out_bias, dropout_mask, Z_raw, prob=self.p_drop) + + +class PairStack(nn.Module): + + def __init__(self, d_pair, p_drop=0.25): + super(PairStack, self).__init__() + + self.TriangleMultiplicationOutgoing = TriangleMultiplicationOutgoing(d_pair, p_drop=p_drop) + self.TriangleMultiplicationIncoming = TriangleMultiplicationIncoming(d_pair, p_drop=p_drop) + 
self.TriangleAttentionStartingNode = TriangleAttentionStartingNode(d_pair, p_drop=p_drop) + self.TriangleAttentionEndingNode = TriangleAttentionEndingNode(d_pair, p_drop=p_drop) + self.PairTransition = Transition(d=d_pair) + + def forward(self, pair): + pair = self.TriangleMultiplicationOutgoing(pair) + pair = self.TriangleMultiplicationIncoming(pair) + pair = self.TriangleAttentionStartingNode(pair) + pair = self.TriangleAttentionEndingNode(pair) + pair = self.PairTransition(pair) + return pair From 820ea4d056e4ca943ca1d143325fb582128a1b96 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Wed, 2 Nov 2022 15:49:25 +0800 Subject: [PATCH 005/209] align evoformer --- chunk_codegen.py | 143 ++++++----------------------------------- chunk_codegen_run.py | 97 ++++++++++------------------ evoformer/evoformer.py | 7 +- evoformer/kernel.py | 2 +- evoformer/msa.py | 2 +- evoformer/triangle.py | 8 +-- 6 files changed, 67 insertions(+), 192 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index c605e35f4725..cb2a3a8a90ee 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -1,5 +1,6 @@ import colossalai import torch +import copy from typing import List, Callable, Any, Tuple, Dict, Iterable try: @@ -17,74 +18,18 @@ __all__ = ['python_code_with_activation_checkpoint'] -def _gen_saved_tensors_hooks(): - """ - Generate saved tensors hooks - """ - - pack_hook = """def pack_hook_input(self, x): - if getattr(x, "offload", False): - return (x.device, x.cpu()) - else: - return x - -def pack_hook_no_input(self, x): - if getattr(x, "offload", True): - return (x.device, x.cpu()) - else: - return x -""" - - unpack_hook = """def unpack_hook(self, packed): - if isinstance(packed, tuple): - device, tensor = packed - return tensor.to(device) - else: - return packed -""" - - return pack_hook, unpack_hook - - -def _gen_loop_5(to_keep): - context = "chunk_result = []\nfor gen_loop_idx in range(4):\n" - context += " chunk_tensor = " + to_keep + "[gen_loop_idx, :]\n" +def 
_gen_loop_start(to_keep, chunk_size=2): + context = "chunk_result = []; chunk_size = %d\nfor gen_loop_idx in range(0, %s.shape[0], chunk_size):\n" % (chunk_size, to_keep[0]) + context += " chunk_tensor = " + to_keep + "[gen_loop_idx:gen_loop_idx + chunk_size, :]\n" return context -def _gen_loop_5_final(final_name, to_keep): +def _gen_loop_end(final_name, to_keep): context = " chunk_result.append(" + final_name + ")\n" context += "chunk_result = torch.cat(chunk_result, dim=0); " + to_keep[0] + " = None\n" context += final_name + " = chunk_result; chunk_result = None\n" return context - -def _gen_save_tensors_hooks_context(offload_input=True) -> str: - """Generate customized saved_tensors_hooks - - Args: - offload_input (bool, optional): whether we need offload input, if offload_input=False, - we will use self.pack_hook_no_input instead. Defaults to True. - - Returns: - str: generated context - """ - - if offload_input: - context = "with torch.autograd.graph.saved_tensors_hooks(self.pack_hook_input, self.unpack_hook):\n" - else: - context = "with torch.autograd.graph.saved_tensors_hooks(self.pack_hook_no_input, self.unpack_hook):\n" - return context - - -def _gen_save_on_cpu_context(): - """ - Generate save on cpu context - """ - - context = "with torch.autograd.graph.save_on_cpu(pin_memory=True):\n" - return context - def _find_input_and_output_nodes(nodes: List[Node]): """ @@ -112,49 +57,6 @@ def _find_input_and_output_nodes(nodes: List[Node]): return input_nodes, output_nodes -def _find_ckpt_regions(nodes: List[Node]): - """ - Find the checkpoint regions given a list of consecutive nodes. The outputs will be list - of tuples, each tuple is in the form of (start_index, end_index). 
- """ - ckpt_nodes = [] - ckpt_regions = [] - start = -1 - end = -1 - current_region = None - - for idx, node in enumerate(nodes): - if hasattr(node, 'activation_checkpoint'): - act_ckpt_label = node.activation_checkpoint - - # this activation checkpoint label is not set yet - # meaning this is the first node of the activation ckpt region - if current_region is None: - current_region = act_ckpt_label - start = idx - - # if activation checkpoint has changed - # we restart the tracking - # e.g. node ckpt states = [ckpt1, ckpt2, ckpt2, ckpt2] - if act_ckpt_label != current_region: - assert start != -1 - ckpt_regions.append((start, idx - 1)) - current_region = act_ckpt_label - start = idx - end = -1 - elif current_region is not None and not hasattr(node, 'activation_checkpoint'): - # used to check the case below - # node ckpt states = [ckpt, ckpt, non-ckpt] - end = idx - 1 - assert start != -1 and end != -1 - ckpt_regions.append((start, end)) - start = end = -1 - current_region = None - else: - pass - return ckpt_regions - - def _find_offload_regions(nodes: List[Node]): """This function is to find the offload regions In pofo algorithm, during annotation, we will annotate the offload region with the @@ -400,12 +302,9 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v emit_node_func: function to emit node delete_unused_value_func: function to remove the unused value """ - ckpt_regions = _find_nested_ckpt_regions(nodes, 0) - start_idx = [item[0] for item in ckpt_regions] - end_idx = [item[1] for item in ckpt_regions] # find the offload regions - chunk_regions, chunk_labels = _find_offload_regions(nodes) + chunk_regions = [(1, 4)] chunk_starts = [item[0] for item in chunk_regions] chunk_ends = [item[1] for item in chunk_regions] chunk_inputs = [] @@ -424,7 +323,7 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v # this flag is to prevent repeated insert of save tensors # hooks definition in ckpt_func 
node_idx = 0 - to_keep = [] + chunk_var = [] while node_idx < len(node_list): # break if we finish the processing all the nodes if node_idx >= len(node_list): @@ -435,28 +334,30 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v node = node_list[node_idx] if node_idx in chunk_starts: - # save chunk input var, dont delete it - to_keep.extend(node.args[0].name) within_chunk_region = True - # add for loop - body.append(_gen_loop_5(to_keep[0])) - # change first node's input to new chunked var - node_args = list(node.args) - node_args[0] = 'chunk_tensor' + # save chunk input var, dont delete it + chunk_var.append(node.args[0].name) + + # add for loop + body.append(_gen_loop_start(chunk_var[0])) + if within_chunk_region: emit_node_func(node, body) + # replace input var with chunk var + if node_idx in chunk_starts: + body[-1] = body[-1].replace("("+ chunk_var[0] +")", '(chunk_tensor)') body[-1] = ' ' + body[-1] - delete_unused_value_func(node, body, to_keep) + delete_unused_value_func(node, body, chunk_var) else: emit_node_func(node, body) if node_idx not in chunk_inputs: - delete_unused_value_func(node, body, to_keep) + delete_unused_value_func(node, body, chunk_var) if node_idx in chunk_ends: - body.append(_gen_loop_5_final(node.name, to_keep)) - to_keep = [] + body.append(_gen_loop_end(node.name, chunk_var)) + chunk_var = [] within_chunk_region = False node_idx += 1 @@ -580,9 +481,7 @@ def delete_unused_values(user: Node, body, to_keep=[]): body.append('\n') return nodes_to_delete = user_to_last_uses.get(user, []) - for n in nodes_to_delete: - if n.name in to_keep: - nodes_to_delete.remove(n) + nodes_to_delete = [i for i in nodes_to_delete if i.name not in to_keep] if len(nodes_to_delete): to_delete_str = ' = '.join([repr(n) for n in nodes_to_delete] + ['None']) body.append(f'; {to_delete_str}\n') diff --git a/chunk_codegen_run.py b/chunk_codegen_run.py index 69b327d4bd5b..7667fa691558 100644 --- a/chunk_codegen_run.py +++ 
b/chunk_codegen_run.py @@ -9,60 +9,39 @@ from colossalai.utils import free_port from colossalai.core import global_context as gpc from colossalai.fx.graph_module import ColoGraphModule - -try: - from chunk_codegen import ChunkCodeGen - with_codegen = True -except: - # fall back to older pytorch version - from chunk_codegen import python_code_with_activation_checkpoint - with_codegen = False - - -class MyNet(torch.nn.Module): - - def __init__(self) -> None: - super().__init__() - self.linear0 = torch.nn.Linear(4, 4) - self.linear1 = torch.nn.Linear(4, 4) - self.linear2 = torch.nn.Linear(4, 4) - self.linear3 = torch.nn.Linear(4, 4) - self.linear4 = torch.nn.Linear(4, 4) - self.linear5 = torch.nn.Linear(4, 4) - self.linear6 = torch.nn.Linear(4, 4) - - def forward(self, x): - x = self.linear0(x) - x = self.linear1(x) - x = self.linear2(x) - x = self.linear3(x) - x = self.linear4(x) - x = self.linear5(x) - x = self.linear6(x) - return x +from evoformer.evoformer import evoformer_base +from chunk_codegen import ChunkCodeGen +with_codegen = True def _is_all_gradient_close(m: torch.nn.Module, gm: GraphModule) -> bool: for m_p, gm_p in zip(m.parameters(), gm.parameters()): - if not torch.allclose(m_p.grad, gm_p.grad): + if m_p.grad is not None and not torch.allclose(m_p.grad, gm_p.grad): return False return True -def _test_fwd_and_bwd(model: torch.nn.Module, gm: ColoGraphModule, data: torch.Tensor): +def _is_all_param_close(m: torch.nn.Module, gm: GraphModule) -> bool: + for m_p, gm_p in zip(m.parameters(), gm.parameters()): + if m_p.grad is not None and not torch.allclose(m_p.data, gm_p.data): + return False + return True + +def _test_fwd_and_bwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair): # test forward - non_fx_out = model(data) - fx_out = gm(data) - print(non_fx_out.shape, fx_out.shape) - assert torch.equal(non_fx_out, fx_out), "fx_out doesn't comply with original output" + non_fx_out = model(node.clone(), pair.clone()) + fx_out = gm(node.clone(), 
pair.clone()) + assert torch.equal(non_fx_out[0], fx_out[0]), "fx_out doesn't comply with original output" + assert torch.equal(non_fx_out[1], fx_out[1]), "fx_out doesn't comply with original output" # test barckward - loss0 = non_fx_out.sum() - loss0.backward() - loss1 = fx_out.sum() - loss1.backward() - assert _is_all_gradient_close(model, gm), "gm doesn't have the same gradient as original one" + # loss0 = non_fx_out[0].sum() + non_fx_out[1].sum() + # loss0.backward() + # loss1 = fx_out[0].sum() + fx_out[1].sum() + # loss1.backward() + # assert _is_all_param_close(model, gm) + # assert _is_all_gradient_close(model, gm), "gm doesn't have the same gradient as original one" def _run_offload_codegen(rank): @@ -70,30 +49,22 @@ def _run_offload_codegen(rank): colossalai.launch(config={}, rank=rank, world_size=1, host='localhost', port=free_port(), backend='nccl') # build model and input - model = MyNet().cuda() - data = torch.rand(4, 4).cuda() + model = evoformer_base().cuda() + node = torch.randn(1, 16, 32, 256).cuda() + pair = torch.randn(1, 32, 32, 128).cuda() # trace the module and replace codegen tracer = ColoTracer(trace_act_ckpt=True) graph = tracer.trace(model) - codegen = ChunkCodeGen() - graph.set_codegen(codegen) - - # annotate the activation offload part - # also annotate the activation_checkpoint so we could test both types - # of input offload - for node in graph.nodes: - if node.name == "linear0": - setattr(node, "activation_offload", [0, True, False]) - if node.name == "linear1": - setattr(node, "activation_offload", [0, True, False]) - # if node.name == "linear2": - # setattr(node, "activation_offload", [1, True, True]) - # if node.name == "linear4": - # setattr(node, "activation_offload", [2, False, True]) - # if node.name == "linear5": - # setattr(node, "activation_checkpoint", [0]) - # setattr(node, "activation_offload", True) + # codegen = ChunkCodeGen() + # graph.set_codegen(codegen) + + # annotate the chunk part + # for node in graph.nodes: + # 
if node.name == "linear0": + # setattr(node, "activation_offload", [0, True, False]) + # if node.name == "linear1": + # setattr(node, "activation_offload", [0, True, False]) gm = ColoGraphModule(copy.deepcopy(model), graph) gm.recompile() @@ -102,7 +73,7 @@ def _run_offload_codegen(rank): code = graph.python_code("self").src print(code) - _test_fwd_and_bwd(model, gm, data) + _test_fwd_and_bwd(model, gm, node, pair) gpc.destroy() diff --git a/evoformer/evoformer.py b/evoformer/evoformer.py index ef3df2769840..0c5ab952a779 100644 --- a/evoformer/evoformer.py +++ b/evoformer/evoformer.py @@ -28,7 +28,7 @@ def __init__(self, d_node, d_pair): super(Evoformer, self).__init__() self.blocks = nn.ModuleList() - for _ in range(3): + for _ in range(1): self.blocks.append(EvoformerBlock(d_node, d_pair)) def forward(self, node, pair): @@ -36,6 +36,11 @@ def forward(self, node, pair): node, pair = b(node, pair) return node, pair + +def evoformer_tiny(): + return Evoformer(d_node=64, d_pair=32) + + def evoformer_base(): return Evoformer(d_node=256, d_pair=128) diff --git a/evoformer/kernel.py b/evoformer/kernel.py index 2655901a2fe9..26ab5dc53261 100644 --- a/evoformer/kernel.py +++ b/evoformer/kernel.py @@ -8,7 +8,7 @@ def bias_sigmod_ele(y, bias, z): def bias_dropout_add(x: torch.Tensor, bias: torch.Tensor, dropmask: torch.Tensor, residual: torch.Tensor, prob: float) -> torch.Tensor: - out = (x + bias) * F.dropout(dropmask, p=prob, training=True) + out = (x + bias) * F.dropout(dropmask, p=prob, training=False) out = residual + out return out diff --git a/evoformer/msa.py b/evoformer/msa.py index ccefa38c48be..cac456638a55 100644 --- a/evoformer/msa.py +++ b/evoformer/msa.py @@ -45,7 +45,7 @@ def forward(self, M_raw, Z): # b = rearrange(b, 'b q k h -> b h q k') M = self.attention(M, b) - dropout_mask = torch.ones_like(M[:, 0:1, :, :], device=M.device, dtype=M.dtype) + dropout_mask = torch.ones_like(M[:, 0:1, :, :]).to(M.device).to(M.dtype) return bias_dropout_add(M, 
self.out_bias, dropout_mask, M_raw, prob=self.p_drop) diff --git a/evoformer/triangle.py b/evoformer/triangle.py index 7db0482f5557..f479469c3836 100644 --- a/evoformer/triangle.py +++ b/evoformer/triangle.py @@ -51,7 +51,7 @@ def forward(self, Z_raw): ab = torch.einsum('bikd,bjkd->bijd', left_proj_act, right_proj_act) ab = self.output_projection(self.layernorm2(ab)) - dropout_mask = torch.ones_like(Z[:, 0:1, :, :], device=Z.device, dtype=Z.dtype) + dropout_mask = torch.ones_like(Z[:, 0:1, :, :]).to(Z.device).to(Z.dtype) return bias_ele_dropout_residual(ab, self.output_bias, g, @@ -97,7 +97,7 @@ def forward(self, Z_raw): ab = torch.einsum('bkid,bkjd->bijd', left_proj_act, right_proj_act) ab = self.output_projection(self.layernorm2(ab)) - dropout_mask = torch.ones_like(Z[:, 0:1, :, :], device=Z.device, dtype=Z.dtype) + dropout_mask = torch.ones_like(Z[:, 0:1, :, :]).to(Z.device).to(Z.dtype) return bias_ele_dropout_residual(ab, self.output_bias, g, @@ -134,7 +134,7 @@ def forward(self, Z_raw): Z = self.attention(Z, b) - dropout_mask = torch.ones_like(Z[:, 0:1, :, :], device=Z.device, dtype=Z.dtype) + dropout_mask = torch.ones_like(Z[:, 0:1, :, :]).to(Z.device).to(Z.dtype) return bias_dropout_add(Z, self.out_bias, dropout_mask, Z_raw, prob=self.p_drop) @@ -168,7 +168,7 @@ def forward(self, Z_raw): Z = self.attention(Z, b) Z = Z.transpose(-2, -3) - dropout_mask = torch.ones_like(Z[:, :, 0:1, :], device=Z.device, dtype=Z.dtype) + dropout_mask = torch.ones_like(Z[:, :, 0:1, :]).to(Z.device).to(Z.dtype) return bias_dropout_add(Z, self.out_bias, dropout_mask, Z_raw, prob=self.p_drop) From f8aeecef46461ff574f51982d03310fa8c57888e Mon Sep 17 00:00:00 2001 From: oahzxl Date: Thu, 3 Nov 2022 14:33:35 +0800 Subject: [PATCH 006/209] add meta --- chunk_codegen.py | 3 +++ chunk_codegen_run.py | 13 +++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index cb2a3a8a90ee..1f336eb2bf35 100644 --- a/chunk_codegen.py +++ 
b/chunk_codegen.py @@ -366,6 +366,9 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v if CODEGEN_AVAILABLE: class ChunkCodeGen(CodeGen): + def __init__(self, meta_graph): + super().__init__() + self.meta_node = list(meta_graph.graph.nodes) def _gen_python_code(self, nodes, root_module: str, namespace: _Namespace) -> PythonCode: free_vars: List[str] = [] diff --git a/chunk_codegen_run.py b/chunk_codegen_run.py index 7667fa691558..b875b6308f55 100644 --- a/chunk_codegen_run.py +++ b/chunk_codegen_run.py @@ -9,6 +9,8 @@ from colossalai.utils import free_port from colossalai.core import global_context as gpc from colossalai.fx.graph_module import ColoGraphModule +from colossalai.fx.passes.meta_info_prop import MetaInfoProp, TensorMetadata +from colossalai.fx.profiler import MetaTensor from evoformer.evoformer import evoformer_base from chunk_codegen import ChunkCodeGen with_codegen = True @@ -56,9 +58,10 @@ def _run_offload_codegen(rank): # trace the module and replace codegen tracer = ColoTracer(trace_act_ckpt=True) graph = tracer.trace(model) - # codegen = ChunkCodeGen() - # graph.set_codegen(codegen) - + gm_prop = torch.fx.GraphModule(model, graph) + interp = MetaInfoProp(gm_prop) + interp.propagate(MetaTensor(node, fake_device='cuda:0'), MetaTensor(pair, fake_device='cuda:0')) + # annotate the chunk part # for node in graph.nodes: # if node.name == "linear0": @@ -66,7 +69,9 @@ def _run_offload_codegen(rank): # if node.name == "linear1": # setattr(node, "activation_offload", [0, True, False]) - gm = ColoGraphModule(copy.deepcopy(model), graph) + codegen = ChunkCodeGen(gm_prop) + # graph.set_codegen(codegen) + gm = ColoGraphModule(model, graph) gm.recompile() # assert we have all the components From c35718e8db5f3fbbb5749a2a0b5f4b46241a43b1 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Fri, 4 Nov 2022 11:18:09 +0800 Subject: [PATCH 007/209] basic chunk --- chunk_codegen.py | 138 +++++++++++++++++++++++++++++-------------- 
chunk_codegen_run.py | 2 +- 2 files changed, 95 insertions(+), 45 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 1f336eb2bf35..1267f64cbbb2 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -18,16 +18,61 @@ __all__ = ['python_code_with_activation_checkpoint'] -def _gen_loop_start(to_keep, chunk_size=2): - context = "chunk_result = []; chunk_size = %d\nfor gen_loop_idx in range(0, %s.shape[0], chunk_size):\n" % (chunk_size, to_keep[0]) - context += " chunk_tensor = " + to_keep + "[gen_loop_idx:gen_loop_idx + chunk_size, :]\n" +def _gen_chunk_slice_dim(chunk_dim, chunk_idx_name, shape): + new_shape = "[" + for idx, i in enumerate(shape): + if idx == chunk_dim: + new_shape += "%s:%s + chunk_size" % (chunk_idx_name, chunk_idx_name) + else: + new_shape += ":" + new_shape += ", " + new_shape = new_shape[:-2] + "]" + return new_shape + + +def _get_first_non_single_dim(shape): + for idx, i in enumerate(shape): + if i == 1: + continue + else: + return idx + raise RuntimeError("can not get first non single dim for shape", shape) + + +def _gen_loop_start(chunk_input_meta, chunk_output, chunk_size=2): + if len(chunk_input_meta) == 1: + node = chunk_input_meta[0] + node_shape = node.meta['tensor_meta'].shape + chunk_dim = _get_first_non_single_dim(node_shape) + chunk_slice = _gen_chunk_slice_dim(chunk_dim, "gen_chunk_idx", node_shape) + out_shape = str(list(chunk_output.meta['tensor_meta'].shape)) + + context = "chunk_result = torch.empty(%s, dtype=%s.dtype, device=%s.device); chunk_size = %d\nfor gen_chunk_idx in range" % ( + out_shape, node.name, node.name, chunk_size) + context += "(0, %s.shape[%d], chunk_size):\n" % (node.name, chunk_dim) + context += " chunk_tensor = %s%s\n" % (node.name, chunk_slice) + else: + raise NotImplementedError("input with size %d not implemented" % len(chunk_input_meta)) return context -def _gen_loop_end(final_name, to_keep): - context = " chunk_result.append(" + final_name + ")\n" - context += "chunk_result = 
torch.cat(chunk_result, dim=0); " + to_keep[0] + " = None\n" - context += final_name + " = chunk_result; chunk_result = None\n" +def _gen_loop_end(chunk_outputs, chunk_inputs, node_list): + chunk_inputs_name = chunk_inputs[0].name + chunk_outputs_name = chunk_outputs.name + chunk_outputs_idx = _find_idx_by_name(chunk_outputs_name, node_list) + chunk_output_shape = chunk_outputs.meta['tensor_meta'].shape + chunk_dim = _get_first_non_single_dim(chunk_output_shape) + chunk_slice = _gen_chunk_slice_dim(chunk_dim, "gen_chunk_idx", chunk_output_shape) + context = " chunk_result%s = %s\n" % (chunk_slice, chunk_outputs_name) + + context += chunk_outputs_name + " = chunk_result; chunk_result = None; chunk_size = None" + + # determine if its the last use for chunk input + users_name = list(chunk_inputs[0].users.keys()) + if all([_find_idx_by_name(user.name, node_list) <= chunk_outputs_idx for user in users_name]): + context += "; %s = None" % chunk_inputs_name + + context += "\n" return context @@ -44,7 +89,7 @@ def _find_input_and_output_nodes(nodes: List[Node]): for input_node in node._input_nodes.keys(): node_repr = repr(input_node) if input_node not in nodes and node_repr not in input_nodes: - input_nodes.append(node_repr) + input_nodes.append(input_node) # if a node has a user node which is not in the node list # we treat that user node as the node receiving the current node output @@ -52,11 +97,18 @@ def _find_input_and_output_nodes(nodes: List[Node]): for output_node in node.users.keys(): node_repr = repr(node) if output_node not in nodes and node_repr not in output_nodes: - output_nodes.append(node_repr) + output_nodes.append(output_node) return input_nodes, output_nodes +def _find_idx_by_name(name, nodes_list): + for idx, node in enumerate(nodes_list): + if node.name == name: + return idx + raise RuntimeError("name %s not found in node list" % name) + + def _find_offload_regions(nodes: List[Node]): """This function is to find the offload regions In pofo algorithm, 
during annotation, we will annotate the offload region with the @@ -290,7 +342,7 @@ def emit_ckpt_func(body, body.append(usage) -def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_value_func): +def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_value_func, meta_nodes): """Emit code with nested activation checkpoint When we detect some of the node.activation_checkpoint is a List, we will use this function to emit the activation checkpoint codes. @@ -304,7 +356,7 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v """ # find the offload regions - chunk_regions = [(1, 4)] + chunk_regions = [(2, 5)] chunk_starts = [item[0] for item in chunk_regions] chunk_ends = [item[1] for item in chunk_regions] chunk_inputs = [] @@ -319,48 +371,46 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v inputs, outputs = _find_input_and_output_nodes(offload_node_list) chunk_inputs.append(inputs) chunk_outputs.append(outputs) - + chunk_inputs_idx = [[_find_idx_by_name(j.name, node_list) for j in i] for i in chunk_inputs] + chunk_outputs_idx = [[_find_idx_by_name(j.name, node_list) for j in i] for i in chunk_outputs] + chunk_inputs_names = [] + for i in chunk_inputs: + for j in i: + chunk_inputs_names.append(j.name) + # this flag is to prevent repeated insert of save tensors # hooks definition in ckpt_func node_idx = 0 - chunk_var = [] + region_idx = 0 while node_idx < len(node_list): - # break if we finish the processing all the nodes - if node_idx >= len(node_list): - break + node = node_list[node_idx] - # process node in forward function - else: - node = node_list[node_idx] + if node_idx in chunk_starts: + within_chunk_region = True + + # add for loop + chunk_input_meta = [meta_nodes[i] for i in chunk_inputs_idx[region_idx]] + body.append(_gen_loop_start(chunk_input_meta, node_list[chunk_ends[region_idx]])) + if within_chunk_region: + emit_node_func(node, body) + # 
replace input var with chunk var if node_idx in chunk_starts: - within_chunk_region = True - - # save chunk input var, dont delete it - chunk_var.append(node.args[0].name) - - # add for loop - body.append(_gen_loop_start(chunk_var[0])) - - if within_chunk_region: - emit_node_func(node, body) - # replace input var with chunk var - if node_idx in chunk_starts: - body[-1] = body[-1].replace("("+ chunk_var[0] +")", '(chunk_tensor)') - body[-1] = ' ' + body[-1] - delete_unused_value_func(node, body, chunk_var) + body[-1] = body[-1].replace("("+ chunk_inputs[region_idx][0].name +")", '(chunk_tensor)') + body[-1] = ' ' + body[-1] + delete_unused_value_func(node, body, chunk_inputs_names) - else: - emit_node_func(node, body) - if node_idx not in chunk_inputs: - delete_unused_value_func(node, body, chunk_var) + else: + emit_node_func(node, body) + if node_idx not in chunk_inputs: + delete_unused_value_func(node, body, chunk_inputs_names) - if node_idx in chunk_ends: - body.append(_gen_loop_end(node.name, chunk_var)) - chunk_var = [] - within_chunk_region = False + if node_idx in chunk_ends: + body.append(_gen_loop_end(node, chunk_inputs[region_idx], node_list)) + within_chunk_region = False + region_idx += 1 - node_idx += 1 + node_idx += 1 if CODEGEN_AVAILABLE: @@ -562,7 +612,7 @@ def emit_node(node: Node, body): # if any node has a list of labels for activation_checkpoint, we # will use nested type of activation checkpoint codegen - emit_code_with_chunk(body, ckpt_func, nodes, emit_node, delete_unused_values) + emit_code_with_chunk(body, ckpt_func, nodes, emit_node, delete_unused_values, self.meta_node) if len(body) == 0: # If the Graph has no non-placeholder nodes, no lines for the body diff --git a/chunk_codegen_run.py b/chunk_codegen_run.py index b875b6308f55..547b983a9c0c 100644 --- a/chunk_codegen_run.py +++ b/chunk_codegen_run.py @@ -70,7 +70,7 @@ def _run_offload_codegen(rank): # setattr(node, "activation_offload", [0, True, False]) codegen = ChunkCodeGen(gm_prop) - 
# graph.set_codegen(codegen) + graph.set_codegen(codegen) gm = ColoGraphModule(model, graph) gm.recompile() From d95cfe26222427e483df7f23f4bb208cec6ae4c3 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Mon, 7 Nov 2022 18:26:13 +0800 Subject: [PATCH 008/209] basic memory --- chunk_codegen.py | 83 ++++++++++++++++++++++++++++++++++++++++++-- chunk_codegen_run.py | 20 +++++------ 2 files changed, 90 insertions(+), 13 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 1267f64cbbb2..4ca33a4d5914 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -6,6 +6,7 @@ try: from torch.fx.node import Node, Argument, map_arg, _type_repr, _get_qualified_name from torch.fx.graph import _Namespace, PythonCode, _custom_builtins, _is_from_torch, _format_target, magic_methods, CodeGen, _origin_type_map, inplace_methods, _CustomBuiltin + from colossalai.fx.profiler import calculate_fwd_out, calculate_fwd_tmp, parameter_size, activation_size CODEGEN_AVAILABLE = True except: from torch.fx.graph import _Namespace, PythonCode, _custom_builtins, _is_from_torch, _format_target, magic_methods, _origin_type_map, _format_args, _CustomBuiltin @@ -18,6 +19,82 @@ __all__ = ['python_code_with_activation_checkpoint'] +def _get_meta_node_size(x): + x = x.meta['tensor_meta'] + x = x.numel * torch.tensor([], dtype=x.dtype).element_size() + return x + + +def _get_output_node_size(n): + fwd_out = {x.uuid: x for x in n.meta["fwd_out"] if isinstance(x, torch.Tensor) and hasattr(x, 'uuid')} + return activation_size(fwd_out) + + +def _get_delete_node_size(user, user_to_last_uses): + if user.op in ('placeholder', 'output'): + return 0 + nodes_to_delete = user_to_last_uses.get(user, []) + if len(nodes_to_delete): + delete_size = sum([_get_output_node_size(i) for i in nodes_to_delete]) + return delete_size + return 0 + + +def _get_last_usr(nodes): + node_to_last_use: Dict[Node, Node] = {} + user_to_last_uses: Dict[Node, List[Node]] = {} + + def register_last_uses(n: Node, user: Node): + if n not 
in node_to_last_use: + node_to_last_use[n] = user + user_to_last_uses.setdefault(user, []).append(n) + + for node in reversed(nodes): + map_arg(node.args, lambda n: register_last_uses(n, node)) + map_arg(node.kwargs, lambda n: register_last_uses(n, node)) + return user_to_last_uses + + +def _estimate_inference_mem(gm: torch.fx.GraphModule): + act_memory = 0 + act_memory_peak_log = [] + act_memory_after_node_log = [] + user_to_last_uses = _get_last_usr(list(gm.graph.nodes)) + for node in gm.graph.nodes: + # if node is placeholder, just add the size of the node + if node.op == 'placeholder': + act_memory += _get_meta_node_size(node) + # skip output + elif node.op == 'output': + continue + # node is an operation, calculate tmp, output node and delete node memory + else: + # forward memory + act_memory += calculate_fwd_tmp(node) + # act_memory += calculate_fwd_out(node) + act_memory += _get_output_node_size(node) + # record max act memory + act_memory_peak_log.append(act_memory) + # delete useless memory + act_memory -= calculate_fwd_tmp(node) + act_memory -= _get_delete_node_size(node, user_to_last_uses) + act_memory_after_node_log.append(act_memory) + + act_memory_peak_log = [float(i) / (1024 ** 2) for i in act_memory_peak_log] + param_memory = parameter_size(gm) + return (act_memory + param_memory) / (1024 ** 2), param_memory / (1024 ** 2) + + +def _estimate_chunk_forward_mem(gm: torch.fx.GraphModule, start_node, end_node, chunk_size): + node_size = 0 + param_size = 0 + for node in gm.graph.nodes: + node_size += calculate_fwd_tmp(node) + node_size += calculate_fwd_out(node) + param_size = parameter_size(gm) + return (node_size + param_size) / 1024**2, param_size / 1024**2 + + def _gen_chunk_slice_dim(chunk_dim, chunk_idx_name, shape): new_shape = "[" for idx, i in enumerate(shape): @@ -342,7 +419,7 @@ def emit_ckpt_func(body, body.append(usage) -def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_value_func, meta_nodes): +def 
emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_value_func, meta_nodes, meta_graph): """Emit code with nested activation checkpoint When we detect some of the node.activation_checkpoint is a List, we will use this function to emit the activation checkpoint codes. @@ -364,6 +441,7 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v within_chunk_region = False node_list = list(nodes) + _estimate_inference_mem(meta_graph) # find the input and output var names for each offload region for idx, (start, end) in enumerate(chunk_regions): @@ -418,6 +496,7 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v class ChunkCodeGen(CodeGen): def __init__(self, meta_graph): super().__init__() + self.meta_graph = meta_graph self.meta_node = list(meta_graph.graph.nodes) def _gen_python_code(self, nodes, root_module: str, namespace: _Namespace) -> PythonCode: @@ -612,7 +691,7 @@ def emit_node(node: Node, body): # if any node has a list of labels for activation_checkpoint, we # will use nested type of activation checkpoint codegen - emit_code_with_chunk(body, ckpt_func, nodes, emit_node, delete_unused_values, self.meta_node) + emit_code_with_chunk(body, ckpt_func, nodes, emit_node, delete_unused_values, self.meta_node, self.meta_graph) if len(body) == 0: # If the Graph has no non-placeholder nodes, no lines for the body diff --git a/chunk_codegen_run.py b/chunk_codegen_run.py index 547b983a9c0c..1ab7d958b0a9 100644 --- a/chunk_codegen_run.py +++ b/chunk_codegen_run.py @@ -2,6 +2,7 @@ import torch import torch.nn.functional as F import pytest +import torch.fx import torch.multiprocessing as mp from torch.fx import GraphModule from colossalai.fx import ColoTracer @@ -56,18 +57,15 @@ def _run_offload_codegen(rank): pair = torch.randn(1, 32, 32, 128).cuda() # trace the module and replace codegen - tracer = ColoTracer(trace_act_ckpt=True) - graph = tracer.trace(model) - gm_prop = 
torch.fx.GraphModule(model, graph) - interp = MetaInfoProp(gm_prop) + graph = ColoTracer().trace(model, meta_args={'node': node.to(torch.device('meta')), 'pair': pair.to(torch.device('meta'))}) + gm_prop = torch.fx.symbolic_trace(model) # must use symbolic_trace + interp = MetaInfoProp(gm_prop) + interp.propagate(MetaTensor(node, fake_device='cuda:0'), MetaTensor(pair, fake_device='cuda:0')) + + # now run it twice to get meta info in graph module, not necessary + gm = torch.fx.GraphModule(model, graph) + interp = MetaInfoProp(gm) interp.propagate(MetaTensor(node, fake_device='cuda:0'), MetaTensor(pair, fake_device='cuda:0')) - - # annotate the chunk part - # for node in graph.nodes: - # if node.name == "linear0": - # setattr(node, "activation_offload", [0, True, False]) - # if node.name == "linear1": - # setattr(node, "activation_offload", [0, True, False]) codegen = ChunkCodeGen(gm_prop) graph.set_codegen(codegen) From 12301dd2e9a1889fe76c6ab719aff1404e92aea0 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Tue, 8 Nov 2022 10:34:14 +0800 Subject: [PATCH 009/209] finish basic inference memory estimation --- chunk_codegen.py | 11 +++++++++++ chunk_codegen_run.py | 14 ++++++++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 4ca33a4d5914..01b29cb33d43 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -64,6 +64,8 @@ def _estimate_inference_mem(gm: torch.fx.GraphModule): # if node is placeholder, just add the size of the node if node.op == 'placeholder': act_memory += _get_meta_node_size(node) + act_memory_peak_log.append(act_memory) + act_memory_after_node_log.append(act_memory) # skip output elif node.op == 'output': continue @@ -81,6 +83,15 @@ def _estimate_inference_mem(gm: torch.fx.GraphModule): act_memory_after_node_log.append(act_memory) act_memory_peak_log = [float(i) / (1024 ** 2) for i in act_memory_peak_log] + act_memory_after_node_log = [float(i) / (1024 ** 2) for i in act_memory_after_node_log] 
+ + # for i in act_memory_peak_log: + # print("%.2f " % i, end='') + # print("\n") + # for i in act_memory_after_node_log: + # print("%.2f " % i, end='') + # print("\n") + param_memory = parameter_size(gm) return (act_memory + param_memory) / (1024 ** 2), param_memory / (1024 ** 2) diff --git a/chunk_codegen_run.py b/chunk_codegen_run.py index 1ab7d958b0a9..cc975f2eaf84 100644 --- a/chunk_codegen_run.py +++ b/chunk_codegen_run.py @@ -32,9 +32,19 @@ def _is_all_param_close(m: torch.nn.Module, gm: GraphModule) -> bool: def _test_fwd_and_bwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair): + # now_mem = torch.cuda.memory_allocated() / 1024**2 + # max_mem = torch.cuda.max_memory_allocated() / 1024**2 + # print("now:%.2f max:%.2f" %(torch.cuda.memory_allocated() / 1024**2, torch.cuda.max_memory_allocated() / 1024**2)) + # with torch.no_grad(): + # fx_out = gm(node, pair) + # new_now_mem = torch.cuda.memory_allocated() / 1024**2 + # new_max_mem = torch.cuda.max_memory_allocated() / 1024**2 + # print("now:%.2f max:%.2f" %(new_now_mem - now_mem, new_max_mem - max_mem)) + # test forward - non_fx_out = model(node.clone(), pair.clone()) - fx_out = gm(node.clone(), pair.clone()) + with torch.no_grad(): + non_fx_out = model(node, pair) + fx_out = gm(node, pair) assert torch.equal(non_fx_out[0], fx_out[0]), "fx_out doesn't comply with original output" assert torch.equal(non_fx_out[1], fx_out[1]), "fx_out doesn't comply with original output" From 8cca684c5684ffb0ac0b68d63df3cbde848d3d08 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Tue, 8 Nov 2022 14:41:57 +0800 Subject: [PATCH 010/209] finish memory estimation --- chunk_codegen.py | 103 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 88 insertions(+), 15 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 01b29cb33d43..baf207795b60 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -85,25 +85,97 @@ def _estimate_inference_mem(gm: torch.fx.GraphModule): act_memory_peak_log = [float(i) / 
(1024 ** 2) for i in act_memory_peak_log] act_memory_after_node_log = [float(i) / (1024 ** 2) for i in act_memory_after_node_log] - # for i in act_memory_peak_log: - # print("%.2f " % i, end='') - # print("\n") - # for i in act_memory_after_node_log: - # print("%.2f " % i, end='') - # print("\n") + print("no chunk") + _print_mem_log(act_memory_peak_log, "peak") + _print_mem_log(act_memory_after_node_log, "after") param_memory = parameter_size(gm) return (act_memory + param_memory) / (1024 ** 2), param_memory / (1024 ** 2) -def _estimate_chunk_forward_mem(gm: torch.fx.GraphModule, start_node, end_node, chunk_size): - node_size = 0 - param_size = 0 - for node in gm.graph.nodes: - node_size += calculate_fwd_tmp(node) - node_size += calculate_fwd_out(node) - param_size = parameter_size(gm) - return (node_size + param_size) / 1024**2, param_size / 1024**2 +def _get_chunk_ratio(node, chunk_dim, chunk_size): + shape = node.meta['tensor_meta'].shape + chunk_ratio = float(chunk_size) / shape[chunk_dim] + return chunk_ratio + + +def _get_chunk_delete_node_size(user, user_to_last_uses, chunk_ratio, node_list, start_node, end_node): + if user.op in ('placeholder', 'output'): + return 0 + nodes_to_delete = user_to_last_uses.get(user, []) + delete_size = 0 + for n in nodes_to_delete: + node_idx = _find_idx_by_name(n.name, node_list) + if start_node <= node_idx < end_node: + delete_size += _get_output_node_size(n) * chunk_ratio + return delete_size + + +def _print_mem_log(log, title=None): + if title: + print("%-8s" % title, end=' ') + for i in log: + print("%.2f " % i, end='') + print("") + + +def _estimate_chunk_inference_mem(gm: torch.fx.GraphModule, start_nodes, end_nodes, chunk_dims, chunk_sizes): + act_memory = 0 + act_memory_peak_log = [] + act_memory_after_node_log = [] + user_to_last_uses = _get_last_usr(list(gm.graph.nodes)) + within_chunk = False + region_idx = 0 + chunk_ratio = 1 # use it to estimate chunk mem + node_list = list(gm.graph.nodes) + + for idx, node in 
enumerate(node_list): + # if node in chunk start nodes, change chunk ratio and add chunk_tensor + if idx in start_nodes: + within_chunk = True + chunk_ratio = _get_chunk_ratio(node, chunk_dims[region_idx], chunk_sizes[region_idx]) + act_memory += _get_output_node_size(node_list[end_nodes[region_idx]]) + + # if node is placeholder, just add the size of the node + if node.op == 'placeholder': + act_memory += _get_meta_node_size(node) * chunk_ratio + act_memory_peak_log.append(act_memory) + # skip output + elif node.op == 'output': + continue + # node is an operation, calculate tmp, output node and delete node memory + else: + # forward memory + act_memory += calculate_fwd_tmp(node) * chunk_ratio + # act_memory += calculate_fwd_out(node) + act_memory += _get_output_node_size(node) * chunk_ratio + # record max act memory + act_memory_peak_log.append(act_memory) + # delete useless memory + act_memory -= calculate_fwd_tmp(node) * chunk_ratio + if within_chunk: + act_memory -= _get_chunk_delete_node_size( + node, user_to_last_uses, chunk_ratio, node_list, start_nodes[region_idx], end_nodes[region_idx]) + else: + act_memory -= _get_delete_node_size(node, user_to_last_uses) + + if idx in end_nodes: + act_memory -= _get_output_node_size(node) * chunk_ratio + within_chunk = False + chunk_ratio = 1 + region_idx += 1 + + act_memory_after_node_log.append(act_memory) + + act_memory_peak_log = [float(i) / (1024 ** 2) for i in act_memory_peak_log] + act_memory_after_node_log = [float(i) / (1024 ** 2) for i in act_memory_after_node_log] + + print("chunk") + _print_mem_log(act_memory_peak_log, "peak") + _print_mem_log(act_memory_after_node_log, "after") + + param_memory = parameter_size(gm) + return (act_memory + param_memory) / (1024 ** 2), param_memory / (1024 ** 2) def _gen_chunk_slice_dim(chunk_dim, chunk_idx_name, shape): @@ -444,7 +516,7 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v """ # find the offload regions - chunk_regions = [(2, 5)] 
+ chunk_regions = [(2, 6)] chunk_starts = [item[0] for item in chunk_regions] chunk_ends = [item[1] for item in chunk_regions] chunk_inputs = [] @@ -452,6 +524,7 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v within_chunk_region = False node_list = list(nodes) + _estimate_chunk_inference_mem(meta_graph, chunk_starts, chunk_ends, [1], [2]) _estimate_inference_mem(meta_graph) # find the input and output var names for each offload region From 22f9c60b6bea147c38127f5a4420a91ab73dc84b Mon Sep 17 00:00:00 2001 From: oahzxl Date: Wed, 9 Nov 2022 17:50:39 +0800 Subject: [PATCH 011/209] fix bug --- evoformer/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evoformer/ops.py b/evoformer/ops.py index ddbba441dd5f..611b7b0fe777 100755 --- a/evoformer/ops.py +++ b/evoformer/ops.py @@ -147,7 +147,7 @@ def forward(self, in_data, nonbatched_bias=None): q = self.to_q(in_data) k = self.to_k(in_data) - v = self.to_k(in_data) + v = self.to_v(in_data) # q, k, v = map(lambda t: rearrange(t, 'b1 b2 n (h d) -> b1 b2 h n d', h=self.n_head), # [q, k, v]) From d7634af5c031aa9f4faaf6ee5ea0c1662d6c6f25 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Fri, 11 Nov 2022 15:43:03 +0800 Subject: [PATCH 012/209] finish memory estimation --- chunk_codegen.py | 107 ++++++++++++++++++++++++++++--------------- chunk_codegen_run.py | 20 ++++---- 2 files changed, 80 insertions(+), 47 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index baf207795b60..c8bb433ef6b5 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -55,15 +55,49 @@ def register_last_uses(n: Node, user: Node): return user_to_last_uses +def _delete_free_var_from_last_use(user_to_last_uses): + for key, value in user_to_last_uses.items(): + for n in value: + if n.op == 'placeholder': + user_to_last_uses[key].remove(n) + + +def _get_contiguous_memory(node, not_contiguous_list, delete=False): + mem = 0 + not_contiguous_ops = ['transpose', 'permute'] + + if node.op == 
'call_function' and 'matmul' in node.name: + for n in node.args: + if n in not_contiguous_list: + # matmul won't change origin tensor, but create a tmp copy + mem += _get_output_node_size(n) + elif node.op == 'call_module': + for n in node.args: + if n in not_contiguous_list: + # module will just make origin tensor to contiguous + if delete: + not_contiguous_list.remove(n) + elif node.op == 'call_method' and any(i in node.name for i in not_contiguous_ops): + if node not in not_contiguous_list: + not_contiguous_list.append(node) + elif any(i in node.args for i in not_contiguous_list): + if node not in not_contiguous_list: + not_contiguous_list.append(node) + + return mem + + def _estimate_inference_mem(gm: torch.fx.GraphModule): - act_memory = 0 + act_memory = 0.0 act_memory_peak_log = [] act_memory_after_node_log = [] + not_contiguous_list = [] user_to_last_uses = _get_last_usr(list(gm.graph.nodes)) + _delete_free_var_from_last_use(user_to_last_uses) for node in gm.graph.nodes: # if node is placeholder, just add the size of the node if node.op == 'placeholder': - act_memory += _get_meta_node_size(node) + act_memory += _get_meta_node_size(node) / (1024 ** 2) act_memory_peak_log.append(act_memory) act_memory_after_node_log.append(act_memory) # skip output @@ -72,25 +106,21 @@ def _estimate_inference_mem(gm: torch.fx.GraphModule): # node is an operation, calculate tmp, output node and delete node memory else: # forward memory - act_memory += calculate_fwd_tmp(node) - # act_memory += calculate_fwd_out(node) - act_memory += _get_output_node_size(node) + act_memory += _get_contiguous_memory(node, not_contiguous_list) / (1024 ** 2) + act_memory += _get_output_node_size(node) / (1024 ** 2) # record max act memory act_memory_peak_log.append(act_memory) # delete useless memory - act_memory -= calculate_fwd_tmp(node) - act_memory -= _get_delete_node_size(node, user_to_last_uses) + act_memory -= _get_delete_node_size(node, user_to_last_uses) / (1024 ** 2) + act_memory -= 
_get_contiguous_memory(node, not_contiguous_list, delete=True) / (1024 ** 2) act_memory_after_node_log.append(act_memory) - act_memory_peak_log = [float(i) / (1024 ** 2) for i in act_memory_peak_log] - act_memory_after_node_log = [float(i) / (1024 ** 2) for i in act_memory_after_node_log] - print("no chunk") - _print_mem_log(act_memory_peak_log, "peak") - _print_mem_log(act_memory_after_node_log, "after") + _print_mem_log(act_memory_peak_log, list(gm.graph.nodes), "peak") + _print_mem_log(act_memory_after_node_log, list(gm.graph.nodes), "after") param_memory = parameter_size(gm) - return (act_memory + param_memory) / (1024 ** 2), param_memory / (1024 ** 2) + return act_memory + param_memory, param_memory def _get_chunk_ratio(node, chunk_dim, chunk_size): @@ -111,19 +141,23 @@ def _get_chunk_delete_node_size(user, user_to_last_uses, chunk_ratio, node_list, return delete_size -def _print_mem_log(log, title=None): +def _print_mem_log(log, nodes, title=None): if title: - print("%-8s" % title, end=' ') - for i in log: - print("%.2f " % i, end='') - print("") + print(title) + for idx, (l, n) in enumerate(zip(log, nodes)): + print("%s:%.2f \t" % (n.name, l), end='') + if (idx + 1) % 3 == 0: + print("") + print("\n") def _estimate_chunk_inference_mem(gm: torch.fx.GraphModule, start_nodes, end_nodes, chunk_dims, chunk_sizes): - act_memory = 0 + act_memory = 0.0 act_memory_peak_log = [] act_memory_after_node_log = [] + not_contiguous_list = [] user_to_last_uses = _get_last_usr(list(gm.graph.nodes)) + _delete_free_var_from_last_use(user_to_last_uses) within_chunk = False region_idx = 0 chunk_ratio = 1 # use it to estimate chunk mem @@ -134,11 +168,11 @@ def _estimate_chunk_inference_mem(gm: torch.fx.GraphModule, start_nodes, end_nod if idx in start_nodes: within_chunk = True chunk_ratio = _get_chunk_ratio(node, chunk_dims[region_idx], chunk_sizes[region_idx]) - act_memory += _get_output_node_size(node_list[end_nodes[region_idx]]) + act_memory += 
_get_output_node_size(node_list[end_nodes[region_idx]]) / (1024 ** 2) # if node is placeholder, just add the size of the node if node.op == 'placeholder': - act_memory += _get_meta_node_size(node) * chunk_ratio + act_memory += _get_meta_node_size(node) * chunk_ratio / (1024 ** 2) act_memory_peak_log.append(act_memory) # skip output elif node.op == 'output': @@ -146,36 +180,33 @@ def _estimate_chunk_inference_mem(gm: torch.fx.GraphModule, start_nodes, end_nod # node is an operation, calculate tmp, output node and delete node memory else: # forward memory - act_memory += calculate_fwd_tmp(node) * chunk_ratio - # act_memory += calculate_fwd_out(node) - act_memory += _get_output_node_size(node) * chunk_ratio + act_memory += _get_contiguous_memory(node, not_contiguous_list) * chunk_ratio / (1024 ** 2) + act_memory += _get_output_node_size(node) * chunk_ratio / (1024 ** 2) # record max act memory act_memory_peak_log.append(act_memory) # delete useless memory - act_memory -= calculate_fwd_tmp(node) * chunk_ratio + act_memory -= _get_contiguous_memory(node, not_contiguous_list, delete=True) * chunk_ratio / (1024 ** 2) if within_chunk: act_memory -= _get_chunk_delete_node_size( - node, user_to_last_uses, chunk_ratio, node_list, start_nodes[region_idx], end_nodes[region_idx]) + node, user_to_last_uses, chunk_ratio, node_list, + start_nodes[region_idx], end_nodes[region_idx]) / (1024 ** 2) else: - act_memory -= _get_delete_node_size(node, user_to_last_uses) + act_memory -= _get_delete_node_size(node, user_to_last_uses) / (1024 ** 2) if idx in end_nodes: - act_memory -= _get_output_node_size(node) * chunk_ratio + act_memory -= _get_output_node_size(node) * chunk_ratio / (1024 ** 2) within_chunk = False chunk_ratio = 1 region_idx += 1 act_memory_after_node_log.append(act_memory) - act_memory_peak_log = [float(i) / (1024 ** 2) for i in act_memory_peak_log] - act_memory_after_node_log = [float(i) / (1024 ** 2) for i in act_memory_after_node_log] - print("chunk") - 
_print_mem_log(act_memory_peak_log, "peak") - _print_mem_log(act_memory_after_node_log, "after") - + _print_mem_log(act_memory_peak_log, node_list, "peak") + _print_mem_log(act_memory_after_node_log, node_list, "after") + param_memory = parameter_size(gm) - return (act_memory + param_memory) / (1024 ** 2), param_memory / (1024 ** 2) + return act_memory + param_memory, param_memory def _gen_chunk_slice_dim(chunk_dim, chunk_idx_name, shape): @@ -516,7 +547,7 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v """ # find the offload regions - chunk_regions = [(2, 6)] + chunk_regions = [(58, 62)] chunk_starts = [item[0] for item in chunk_regions] chunk_ends = [item[1] for item in chunk_regions] chunk_inputs = [] @@ -683,7 +714,9 @@ def register_last_uses(n: Node, user: Node): for node in reversed(nodes): map_arg(node.args, lambda n: register_last_uses(n, node)) map_arg(node.kwargs, lambda n: register_last_uses(n, node)) - + + _delete_free_var_from_last_use(user_to_last_uses) + # NOTE: we add a variable to distinguish body and ckpt_func def delete_unused_values(user: Node, body, to_keep=[]): """ diff --git a/chunk_codegen_run.py b/chunk_codegen_run.py index cc975f2eaf84..39363a80abcb 100644 --- a/chunk_codegen_run.py +++ b/chunk_codegen_run.py @@ -32,14 +32,14 @@ def _is_all_param_close(m: torch.nn.Module, gm: GraphModule) -> bool: def _test_fwd_and_bwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair): - # now_mem = torch.cuda.memory_allocated() / 1024**2 - # max_mem = torch.cuda.max_memory_allocated() / 1024**2 - # print("now:%.2f max:%.2f" %(torch.cuda.memory_allocated() / 1024**2, torch.cuda.max_memory_allocated() / 1024**2)) - # with torch.no_grad(): - # fx_out = gm(node, pair) - # new_now_mem = torch.cuda.memory_allocated() / 1024**2 - # new_max_mem = torch.cuda.max_memory_allocated() / 1024**2 - # print("now:%.2f max:%.2f" %(new_now_mem - now_mem, new_max_mem - max_mem)) + now_mem = torch.cuda.memory_allocated() / 1024**2 + 
with torch.no_grad(): + node0 = node.clone() + pair0 = pair.clone() + node1, pair1 = gm(node0, pair0) + new_now_mem = torch.cuda.memory_allocated() / 1024**2 + new_max_mem = torch.cuda.max_memory_allocated() / 1024**2 + print("now:%.2f max:%.2f" %(new_now_mem - now_mem, new_max_mem - now_mem)) # test forward with torch.no_grad(): @@ -63,8 +63,8 @@ def _run_offload_codegen(rank): # build model and input model = evoformer_base().cuda() - node = torch.randn(1, 16, 32, 256).cuda() - pair = torch.randn(1, 32, 32, 128).cuda() + node = torch.randn(1, 100, 300, 256).cuda() + pair = torch.randn(1, 300, 300, 128).cuda() # trace the module and replace codegen graph = ColoTracer().trace(model, meta_args={'node': node.to(torch.device('meta')), 'pair': pair.to(torch.device('meta'))}) From 1607d04e81530a3de96ce064b961c2b10ed7067a Mon Sep 17 00:00:00 2001 From: oahzxl Date: Mon, 14 Nov 2022 16:02:47 +0800 Subject: [PATCH 013/209] add part of index tracer --- chunk_codegen.py | 119 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) diff --git a/chunk_codegen.py b/chunk_codegen.py index c8bb433ef6b5..4b8882afc105 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -19,6 +19,123 @@ __all__ = ['python_code_with_activation_checkpoint'] +class NodeIndexTracer(object): + def __init__(self, gm) -> None: + self.gm = gm + self.nodes_list = list(gm.graph.nodes) + self.idx_trace_list = [{'idx': [], 'compute': []} for _ in range(len(self.nodes_list))] + self.idx_trace_equal = [] + self.idx_count = 1 + + def add_index(self): + self.idx_count += 1 + return self.idx_count - 1 + + def inherit_computation(self, node_from, node_to): + _, compute_from = self.find_trace_from_node(node_from) + idx_to, compute_to = self.find_trace_from_node(node_to) + for i in compute_from: + if i in idx_to: + compute_to.append(i) + + def mark_idx_equal(self, idx1, idx2): + self.idx_trace_equal.append((idx1, idx2)) + + def mark_computation(self, node, idx, dim): + input_node_idx_trace 
= self.find_idx_trace_from_node(node) + if isinstance(dim, int): + dim = [dim] + for d in dim: + cur_idx = input_node_idx_trace[d] + self.idx_trace_list[idx]['compute'].append(cur_idx) + + def find_trace_from_node(self, node): + node_idx = _find_idx_by_name(node.name, self.nodes_list) + node_dict = self.idx_trace_list[node_idx] + return node_dict['idx'], node_dict['compute'] + + def find_idx_trace_from_node(self, node): + node_idx = _find_idx_by_name(node.name, self.nodes_list) + node_idx_trace = self.idx_trace_list[node_idx]['idx'] + return node_idx_trace + + def assign_index_as_input(self, node, node_idx): + input_node_idx = _find_idx_by_name(node.args[0].name, self.nodes_list) + input_node_idx_trace = self.idx_trace_list[input_node_idx]['idx'] + + new_idx_trace = copy.deepcopy(input_node_idx_trace) + self.idx_trace_list[node_idx]['idx'] = new_idx_trace + + def assign_all_index(self, node, node_idx): + shape = node.meta['tensor_meta'].shape + new_trace = [] + for _ in shape: + new_trace.append(self.add_index()) + self.idx_trace_list[node_idx]['idx'] = new_trace + + def assign_transpose_index(self, node, node_idx): + tranpose_dim = node.args[1:] + input_node_idx_trace = self.find_idx_trace_from_node(node.args[0]) + + new_idx_trace = copy.deepcopy(input_node_idx_trace) + new_idx_trace[tranpose_dim[0]] = input_node_idx_trace[tranpose_dim[1]] + new_idx_trace[tranpose_dim[1]] = input_node_idx_trace[tranpose_dim[0]] + + self.idx_trace_list[node_idx]['idx'] = new_idx_trace + + def assign_linear_index(self, node, node_idx): + input_node, weight, bias = node.args + input_node_idx_trace = self.find_idx_trace_from_node(input_node) + weight_idx_trace = self.find_idx_trace_from_node(weight) + + new_idx_trace = copy.deepcopy(input_node_idx_trace) + new_idx_trace[-1] = weight_idx_trace[1] + self.idx_trace_list[node_idx]['idx'] = new_idx_trace + + self.inherit_computation(input_node, node) + self.mark_computation(node, node_idx, [-1]) + 
self.mark_idx_equal(input_node_idx_trace[-1], weight_idx_trace[0]) + + if bias: + bias_idx_trace = self.find_idx_trace_from_node(bias) + self.mark_idx_equal(input_node_idx_trace[-1], bias_idx_trace[0]) + + def assign_layernorm_index(self, node, idx): + self.assign_index_as_input(node, idx) + self.mark_computation(node, idx, [-1, -2]) + + def trace_node_idx(self): + for idx, node in enumerate(self.nodes_list): + if node.op == 'placeholder': + self.assign_all_index(node, idx) + elif node.op == 'call_method': + if 'transpose' in node.name: + self.assign_transpose_index(node, idx) + elif 'view' in node.name: + pass + elif 'permute' in node.name: + pass + else: + raise NotImplementedError(node.name, "method not implemented yet!") + elif node.op == 'call_function': + if 'linear' in node.name: + self.assign_linear_index(node, idx) + elif 'getattr' in node.name: + continue # get attr like shape + elif 'getitem' in node.name: + continue # get item in list + else: + raise NotImplementedError(node.name, "function not implemented yet!") + elif node.op == 'call_module': + if 'layernorm' in node.name: + self.assign_layernorm_index(node, idx) + else: + raise NotImplementedError(node.name, "module not implemented yet!") + elif node.op == 'get_attr': + self.assign_all_index(node, idx) # get param + else: + raise NotImplementedError(node.op, "op not implemented yet!") + def _get_meta_node_size(x): x = x.meta['tensor_meta'] x = x.numel * torch.tensor([], dtype=x.dtype).element_size() @@ -557,6 +674,8 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v node_list = list(nodes) _estimate_chunk_inference_mem(meta_graph, chunk_starts, chunk_ends, [1], [2]) _estimate_inference_mem(meta_graph) + node_index_tracer = NodeIndexTracer(meta_graph) + node_index_tracer.trace_node_idx() # find the input and output var names for each offload region for idx, (start, end) in enumerate(chunk_regions): From c36dba07defa3069ba65d5aafc53d8292e78cf60 Mon Sep 17 00:00:00 2001 
From: oahzxl Date: Mon, 14 Nov 2022 23:38:05 +0800 Subject: [PATCH 014/209] finish basic index tracer --- chunk_codegen.py | 133 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 124 insertions(+), 9 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 4b8882afc105..8477fe9a1702 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -25,6 +25,7 @@ def __init__(self, gm) -> None: self.nodes_list = list(gm.graph.nodes) self.idx_trace_list = [{'idx': [], 'compute': []} for _ in range(len(self.nodes_list))] self.idx_trace_equal = [] + self.idx_view_list = [] self.idx_count = 1 def add_index(self): @@ -35,7 +36,7 @@ def inherit_computation(self, node_from, node_to): _, compute_from = self.find_trace_from_node(node_from) idx_to, compute_to = self.find_trace_from_node(node_to) for i in compute_from: - if i in idx_to: + if i in idx_to and i not in compute_to: compute_to.append(i) def mark_idx_equal(self, idx1, idx2): @@ -47,7 +48,8 @@ def mark_computation(self, node, idx, dim): dim = [dim] for d in dim: cur_idx = input_node_idx_trace[d] - self.idx_trace_list[idx]['compute'].append(cur_idx) + if cur_idx not in self.idx_trace_list[idx]['compute']: + self.idx_trace_list[idx]['compute'].append(cur_idx) def find_trace_from_node(self, node): node_idx = _find_idx_by_name(node.name, self.nodes_list) @@ -56,8 +58,11 @@ def find_trace_from_node(self, node): def find_idx_trace_from_node(self, node): node_idx = _find_idx_by_name(node.name, self.nodes_list) - node_idx_trace = self.idx_trace_list[node_idx]['idx'] - return node_idx_trace + return self.idx_trace_list[node_idx]['idx'] + + def find_compute_trace_from_node(self, node): + node_idx = _find_idx_by_name(node.name, self.nodes_list) + return self.idx_trace_list[node_idx]['compute'] def assign_index_as_input(self, node, node_idx): input_node_idx = _find_idx_by_name(node.args[0].name, self.nodes_list) @@ -82,6 +87,18 @@ def assign_transpose_index(self, node, node_idx): new_idx_trace[tranpose_dim[1]] = 
input_node_idx_trace[tranpose_dim[0]] self.idx_trace_list[node_idx]['idx'] = new_idx_trace + self.inherit_computation(node.args[0], node) + + def assign_permute_index(self, node, node_idx): + permute_dim = node.args[1:] + input_node_idx_trace = self.find_idx_trace_from_node(node.args[0]) + + new_idx_trace = copy.deepcopy(input_node_idx_trace) + for idx, d in enumerate(permute_dim): + new_idx_trace[idx] = input_node_idx_trace[d] + + self.idx_trace_list[node_idx]['idx'] = new_idx_trace + self.inherit_computation(node.args[0], node) def assign_linear_index(self, node, node_idx): input_node, weight, bias = node.args @@ -100,10 +117,99 @@ def assign_linear_index(self, node, node_idx): bias_idx_trace = self.find_idx_trace_from_node(bias) self.mark_idx_equal(input_node_idx_trace[-1], bias_idx_trace[0]) + def assign_matmul_index(self, node, node_idx): + matmul_left, matmul_right = node.args + matmul_left_idx_trace = self.find_idx_trace_from_node(matmul_left) + matmul_right_idx_trace = self.find_idx_trace_from_node(matmul_right) + + assert(len(matmul_left_idx_trace) == len(matmul_right_idx_trace)) + new_idx_trace = copy.deepcopy(matmul_left_idx_trace) + new_idx_trace[-1] = matmul_right_idx_trace[-1] + self.idx_trace_list[node_idx]['idx'] = new_idx_trace + + self.inherit_computation(matmul_left, node) + self.inherit_computation(matmul_right, node) + self.mark_computation(node, node_idx, [-1]) + self.mark_idx_equal(matmul_left_idx_trace[-1], matmul_right_idx_trace[-2]) + def assign_layernorm_index(self, node, idx): self.assign_index_as_input(node, idx) + self.inherit_computation(node.args[0], node) self.mark_computation(node, idx, [-1, -2]) - + + def assign_elementwise_index(self, node, idx): + self.assign_index_as_input(node, idx) + for node_in in node.args: + if type(node_in) not in (int, float): + self.inherit_computation(node_in, node) + + def assign_softmax_index(self, node, idx): + self.assign_index_as_input(node, idx) + self.mark_computation(node, idx, 
[node.kwargs['dim']]) + + def assign_view_reshape_index(self, node, node_idx): + # get data, turn into number + origin_node = node.args[0] + origin_shape = origin_node.meta['tensor_meta'].shape + target_shape = [] + for i in range(1, len(node.args)): + if isinstance(node.args[i], int): + target_shape.append(node.args[i]) + else: + target_shape.append(node.args[i].meta['fwd_out'][0]) + + # compute the value of -1 + if -1 in target_shape: + origin_product = 1 + for i in origin_shape: + origin_product *= i + target_product = -1 + for i in target_shape: + target_product *= i + shape_idx = target_shape.index(-1) + target_shape[shape_idx] = origin_product // target_product + + # determine changed dim + len_diff = len(origin_shape) - len(target_shape) + if len_diff == 1: + # dim merge + dim_equal = [i == j for i, j in zip(origin_shape[:-1], target_shape)] + dim_to = [dim_equal.index(False)] + dim_from = [dim_equal.index(False), dim_equal.index(False) + 1] + elif len_diff == -1: + # dim expand + dim_equal = [i == j for i, j in zip(origin_shape, target_shape[:-1])] + dim_from = [dim_equal.index(False)] + dim_to = [dim_equal.index(False), dim_equal.index(False) + 1] + else: + raise NotImplementedError("shape" + str(origin_shape) + 'and' + str(target_shape) + "view not implemented") + + # get new index + origin_trace = self.find_idx_trace_from_node(origin_node) + new_trace = copy.deepcopy(origin_trace) + dim_from.reverse() + for i in dim_from: + new_trace.pop(i) + for i in dim_to: + new_trace.insert(i, self.add_index()) + self.idx_trace_list[node_idx]['idx'] = new_trace + + # inherit computation + self.inherit_computation(origin_node, node) + compute_log = self.find_compute_trace_from_node(origin_node) + for i in dim_from: + if origin_trace[i] in compute_log: + for j in dim_to: + self.mark_computation(node, node_idx, [j]) + break + + # log view + view_dict = {"idx_from": [origin_trace[i] for i in dim_from], + "dim_from": dim_from, + "idx_to": [new_trace[i] for i in dim_to], + 
"dim_to": dim_to} + self.idx_view_list.append(view_dict) + def trace_node_idx(self): for idx, node in enumerate(self.nodes_list): if node.op == 'placeholder': @@ -111,15 +217,21 @@ def trace_node_idx(self): elif node.op == 'call_method': if 'transpose' in node.name: self.assign_transpose_index(node, idx) - elif 'view' in node.name: - pass elif 'permute' in node.name: - pass + self.assign_permute_index(node, idx) + elif 'view' in node.name or 'reshape' in node.name: + self.assign_view_reshape_index(node, idx) else: raise NotImplementedError(node.name, "method not implemented yet!") elif node.op == 'call_function': if 'linear' in node.name: self.assign_linear_index(node, idx) + elif 'matmul' in node.name: + self.assign_matmul_index(node, idx) + elif 'softmax' in node.name: + self.assign_softmax_index(node, idx) + elif any(n in node.name for n in ['mul', 'add', 'sigmoid', 'relu']): + self.assign_elementwise_index(node, idx) elif 'getattr' in node.name: continue # get attr like shape elif 'getitem' in node.name: @@ -127,12 +239,14 @@ def trace_node_idx(self): else: raise NotImplementedError(node.name, "function not implemented yet!") elif node.op == 'call_module': - if 'layernorm' in node.name: + if any(n in node.name for n in ['layernorm', 'norm']): self.assign_layernorm_index(node, idx) else: raise NotImplementedError(node.name, "module not implemented yet!") elif node.op == 'get_attr': self.assign_all_index(node, idx) # get param + elif node.op == 'output': + continue else: raise NotImplementedError(node.op, "op not implemented yet!") @@ -297,6 +411,7 @@ def _estimate_chunk_inference_mem(gm: torch.fx.GraphModule, start_nodes, end_nod # node is an operation, calculate tmp, output node and delete node memory else: # forward memory + # TODO: permute will create a tmp copy if not contiguous act_memory += _get_contiguous_memory(node, not_contiguous_list) * chunk_ratio / (1024 ** 2) act_memory += _get_output_node_size(node) * chunk_ratio / (1024 ** 2) # record max act 
memory From 70a98b8f56e690b75039561a729c5b623d175512 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Mon, 14 Nov 2022 23:49:48 +0800 Subject: [PATCH 015/209] add doc string --- chunk_codegen.py | 59 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 2 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 8477fe9a1702..aa9d7ecd861f 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -26,13 +26,28 @@ def __init__(self, gm) -> None: self.idx_trace_list = [{'idx': [], 'compute': []} for _ in range(len(self.nodes_list))] self.idx_trace_equal = [] self.idx_view_list = [] - self.idx_count = 1 + self.idx_count = -1 def add_index(self): + """ + Update the count and return it. To record the idx number. + + Returns: + idx_count: int + """ self.idx_count += 1 - return self.idx_count - 1 + return self.idx_count def inherit_computation(self, node_from, node_to): + """ + Inherit computed dim from node_from to node_to. + If a dim in node_from is marked as computed and exists in node_to, + still mark it as computed in node_to. + + Args: + node_from (node): node to be inherited + node_to (node): new node to inherit + """ _, compute_from = self.find_trace_from_node(node_from) idx_to, compute_to = self.find_trace_from_node(node_to) for i in compute_from: @@ -40,9 +55,24 @@ def inherit_computation(self, node_from, node_to): compute_to.append(i) def mark_idx_equal(self, idx1, idx2): + """ + Mark 2 index to be equal. + + Args: + idx1 (int): index count. + idx2 (int): index count. + """ self.idx_trace_equal.append((idx1, idx2)) def mark_computation(self, node, idx, dim): + """ + Mark some dims of node as computed. 
+ + Args: + node (node) + idx (int): node index + dim (list or int): dims to be marked as computed + """ input_node_idx_trace = self.find_idx_trace_from_node(node) if isinstance(dim, int): dim = [dim] @@ -52,15 +82,40 @@ def mark_computation(self, node, idx, dim): self.idx_trace_list[idx]['compute'].append(cur_idx) def find_trace_from_node(self, node): + """ + Find node idx and compute trace by the node. + + Args: + node (node) + Returns: + idx (list): idx of the node + compute (list): computed idx of the node. + """ node_idx = _find_idx_by_name(node.name, self.nodes_list) node_dict = self.idx_trace_list[node_idx] return node_dict['idx'], node_dict['compute'] def find_idx_trace_from_node(self, node): + """ + Find node idx trace by the node. + + Args: + node (node) + Returns: + idx (list): idx of the node + """ node_idx = _find_idx_by_name(node.name, self.nodes_list) return self.idx_trace_list[node_idx]['idx'] def find_compute_trace_from_node(self, node): + """ + Find node compute trace by the node. + + Args: + node (node) + Returns: + compute (list): computed idx of the node. + """ node_idx = _find_idx_by_name(node.name, self.nodes_list) return self.idx_trace_list[node_idx]['compute'] From f379d1a94d5ffc7aa4a0c47ffc56cddbf99f4650 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Tue, 15 Nov 2022 10:18:00 +0800 Subject: [PATCH 016/209] add doc str --- chunk_codegen.py | 95 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) diff --git a/chunk_codegen.py b/chunk_codegen.py index aa9d7ecd861f..a14f7c134985 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -120,6 +120,13 @@ def find_compute_trace_from_node(self, node): return self.idx_trace_list[node_idx]['compute'] def assign_index_as_input(self, node, node_idx): + """ + Assign node's trace as its input node. 
+ + Args: + node (node) + node_idx (int) + """ input_node_idx = _find_idx_by_name(node.args[0].name, self.nodes_list) input_node_idx_trace = self.idx_trace_list[input_node_idx]['idx'] @@ -127,6 +134,13 @@ def assign_index_as_input(self, node, node_idx): self.idx_trace_list[node_idx]['idx'] = new_idx_trace def assign_all_index(self, node, node_idx): + """ + Add new index for all node's dims. + + Args: + node (node) + node_idx (int) + """ shape = node.meta['tensor_meta'].shape new_trace = [] for _ in shape: @@ -134,6 +148,15 @@ def assign_all_index(self, node, node_idx): self.idx_trace_list[node_idx]['idx'] = new_trace def assign_transpose_index(self, node, node_idx): + """ + Assign index for transpose op. + 1. swap input's dim according to transpose args + 2. inherit input's computation + + Args: + node (node) + node_idx (int) + """ tranpose_dim = node.args[1:] input_node_idx_trace = self.find_idx_trace_from_node(node.args[0]) @@ -145,6 +168,15 @@ def assign_transpose_index(self, node, node_idx): self.inherit_computation(node.args[0], node) def assign_permute_index(self, node, node_idx): + """ + Assign index for permute op. + 1. swap input's dim according to permute args + 2. inherit input's computation + + Args: + node (node) + node_idx (int) + """ permute_dim = node.args[1:] input_node_idx_trace = self.find_idx_trace_from_node(node.args[0]) @@ -156,6 +188,16 @@ def assign_permute_index(self, node, node_idx): self.inherit_computation(node.args[0], node) def assign_linear_index(self, node, node_idx): + """ + Assign index for linear op. + 1. copy trace from input node and change last index accroding to weight + 2. mark equal for input node last index, weight first dim and bias dim. + 3. inherit input's computation, mark computation for last dim. 
+ + Args: + node (node) + node_idx (int) + """ input_node, weight, bias = node.args input_node_idx_trace = self.find_idx_trace_from_node(input_node) weight_idx_trace = self.find_idx_trace_from_node(weight) @@ -173,6 +215,16 @@ def assign_linear_index(self, node, node_idx): self.mark_idx_equal(input_node_idx_trace[-1], bias_idx_trace[0]) def assign_matmul_index(self, node, node_idx): + """ + Assign index for matmul op. + 1. copy trace from matmul_left and change last index accroding to matmul_right. (assert they have same length) + 2. mark equal for input matmul_left -1 index and matmul_right -2 dim. + 3. inherit matmul_left and matmul_right computation, mark computation for last dim. + + Args: + node (node) + node_idx (int) + """ matmul_left, matmul_right = node.args matmul_left_idx_trace = self.find_idx_trace_from_node(matmul_left) matmul_right_idx_trace = self.find_idx_trace_from_node(matmul_right) @@ -188,21 +240,63 @@ def assign_matmul_index(self, node, node_idx): self.mark_idx_equal(matmul_left_idx_trace[-1], matmul_right_idx_trace[-2]) def assign_layernorm_index(self, node, idx): + """ + Assign index for layernorm op. + 1. assign index as input node + 2. inherit computation and mark last 2 dims as computed. + + Args: + node (node) + node_idx (int) + """ self.assign_index_as_input(node, idx) self.inherit_computation(node.args[0], node) self.mark_computation(node, idx, [-1, -2]) def assign_elementwise_index(self, node, idx): + """ + Assign index for element-wise op (eg. relu sigmoid add mul). + 1. assign index as input node + 2. inherit computation from all input nodes. + + Args: + node (node) + node_idx (int) + """ self.assign_index_as_input(node, idx) for node_in in node.args: if type(node_in) not in (int, float): self.inherit_computation(node_in, node) def assign_softmax_index(self, node, idx): + """ + Assign index for softmax op. + 1. assign index as input node + 2. inherit computation and mark softmax dim as computed. 
+ + Args: + node (node) + node_idx (int) + """ self.assign_index_as_input(node, idx) + self.inherit_computation(node.args[0], node) self.mark_computation(node, idx, [node.kwargs['dim']]) def assign_view_reshape_index(self, node, node_idx): + """ + Assign index for view and reshape op. + 1. get origin shape and target shape by meta info. + 2. compute the real value of -1 in target shape. + 3. determine changed dim, and assgin index for generated dim. + 4. log changed dim and generated dim for restore + 5. look into view list to see whether the view is associated with other, + if so assgin equal dim according to previous view. + 6. inherit computation. + + Args: + node (node) + node_idx (int) + """ # get data, turn into number origin_node = node.args[0] origin_shape = origin_node.meta['tensor_meta'].shape @@ -305,6 +399,7 @@ def trace_node_idx(self): else: raise NotImplementedError(node.op, "op not implemented yet!") + def _get_meta_node_size(x): x = x.meta['tensor_meta'] x = x.numel * torch.tensor([], dtype=x.dtype).element_size() From 7e2bd1e42892a3021b9882fb0d08f18cfcbcfe86 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Tue, 15 Nov 2022 10:36:02 +0800 Subject: [PATCH 017/209] polish code --- chunk_codegen.py | 258 ++--------------------------------------------- 1 file changed, 8 insertions(+), 250 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index a14f7c134985..9930a0570436 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -3,20 +3,11 @@ import copy from typing import List, Callable, Any, Tuple, Dict, Iterable -try: - from torch.fx.node import Node, Argument, map_arg, _type_repr, _get_qualified_name - from torch.fx.graph import _Namespace, PythonCode, _custom_builtins, _is_from_torch, _format_target, magic_methods, CodeGen, _origin_type_map, inplace_methods, _CustomBuiltin - from colossalai.fx.profiler import calculate_fwd_out, calculate_fwd_tmp, parameter_size, activation_size - CODEGEN_AVAILABLE = True -except: - from torch.fx.graph import 
_Namespace, PythonCode, _custom_builtins, _is_from_torch, _format_target, magic_methods, _origin_type_map, _format_args, _CustomBuiltin - from torch.fx.node import Node, Argument, map_arg, _type_repr, _get_qualified_name - CODEGEN_AVAILABLE = False - -if CODEGEN_AVAILABLE: - __all__ = ['ChunkCodeGen'] -else: - __all__ = ['python_code_with_activation_checkpoint'] +from torch.fx.node import Node, Argument, map_arg, _type_repr, _get_qualified_name +from torch.fx.graph import _Namespace, PythonCode, _custom_builtins, _is_from_torch, _format_target, magic_methods, CodeGen, _origin_type_map, inplace_methods, _CustomBuiltin +from colossalai.fx.profiler import calculate_fwd_out, calculate_fwd_tmp, parameter_size, activation_size +CODEGEN_AVAILABLE = True +__all__ = ['ChunkCodeGen'] class NodeIndexTracer(object): @@ -289,9 +280,9 @@ def assign_view_reshape_index(self, node, node_idx): 2. compute the real value of -1 in target shape. 3. determine changed dim, and assgin index for generated dim. 4. log changed dim and generated dim for restore - 5. look into view list to see whether the view is associated with other, + 5. inherit computation. + 6. TODO: look into view list to see whether the view is associated with other, if so assgin equal dim according to previous view. - 6. inherit computation. 
Args: node (node) @@ -352,7 +343,7 @@ def assign_view_reshape_index(self, node, node_idx): self.mark_computation(node, node_idx, [j]) break - # log view + # log view, not used now view_dict = {"idx_from": [origin_trace[i] for i in dim_from], "dim_from": dim_from, "idx_to": [new_trace[i] for i in dim_to], @@ -680,239 +671,6 @@ def _find_idx_by_name(name, nodes_list): if node.name == name: return idx raise RuntimeError("name %s not found in node list" % name) - - -def _find_offload_regions(nodes: List[Node]): - """This function is to find the offload regions - In pofo algorithm, during annotation, we will annotate the offload region with the - list in the form of [idx, offload_input, offload_bar]. idx indicates the offload - region's index, offload_input is a bool type indicates whether we need to offload - the input, offload_bar is a bool type indicates whether we need to offload all the - intermediate x_bars of this region. - """ - offload_regions = [] - offload_labels = [] - start = -1 - end = -1 - current_region = None - - for idx, node in enumerate(nodes): - if hasattr(node, 'activation_offload') and isinstance(getattr(node, 'activation_offload', None), Iterable): - act_offload_label = node.activation_offload - - if current_region == None: - current_region = act_offload_label - start = idx - offload_labels.append(act_offload_label) - - if act_offload_label != current_region: - assert start != -1 - offload_regions.append((start, idx - 1)) - offload_labels.append(act_offload_label) - current_region = act_offload_label - start = idx - end = -1 - - else: - if current_region is not None: - end = idx - 1 - assert start != -1 and end != -1 - offload_regions.append((start, end)) - start = end = -1 - current_region = None - - else: - pass - - return offload_regions, offload_labels - - -def _gen_ckpt_fn_def(label, free_vars: List[str]) -> str: - """ - Generate the checkpoint function definition - """ - return f"def checkpoint_{label}({', '.join(['self'] + free_vars)}):" - 
- -def _gen_ckpt_output(output_vars: List[str]) -> str: - """ - Generate the return statement for checkpoint region - """ - return f"return {', '.join(output_vars)}" - - -def _gen_ckpt_usage(label, activation_offload, input_vars, output_vars, use_reentrant=True): - """ - Generate the checkpoint function call code text - """ - outputs = ', '.join(output_vars) - inputs = ', '.join(input_vars) - return f'{outputs} = colossalai.utils.activation_checkpoint.checkpoint(self.checkpoint_{label}, {activation_offload}, {inputs}, use_reentrant={use_reentrant})' - - -def _end_of_ckpt(node: Node, check_idx: int) -> bool: - """Check if the node could end the ckpt region - - Args: - node (Node): torch.fx.Node - check_idx (int): the index of checkpoint level for - nested checkpoint - - Returns: - bool - """ - if hasattr(node, "activation_checkpoint"): - if isinstance(node.activation_checkpoint, list): - return node.activation_checkpoint[check_idx] == None - else: - return False - else: - return True - - -def _find_nested_ckpt_regions(nodes, check_idx=0): - """ - Find the nested checkpoint regions given a list of consecutive nodes. The outputs - will be list of tuples, each tuple is in the form of (start_index, end_index). - """ - ckpt_regions = [] - start = -1 - end = -1 - current_region = None - - for idx, node in enumerate(nodes): - if hasattr(node, 'activation_checkpoint'): - if isinstance(getattr(node, 'activation_checkpoint'), int): - act_ckpt_label = node.activation_checkpoint - else: - act_ckpt_label = node.activation_checkpoint[check_idx] - - # this activation checkpoint label is not set yet - # meaning this is the first node of the activation ckpt region - if current_region is None: - current_region = act_ckpt_label - start = idx - - # if activation checkpoint has changed - # we restart the tracking - # e.g. 
node ckpt states = [ckpt1, ckpt2, ckpt2, ckpt2] - if act_ckpt_label != current_region: - assert start != -1 - ckpt_regions.append((start, idx - 1)) - current_region = act_ckpt_label - start = idx - end = -1 - elif current_region is not None and _end_of_ckpt(node, check_idx): - # used to check the case below - # node ckpt states = [ckpt, ckpt, non-ckpt] - end = idx - 1 - assert start != -1 and end != -1 - ckpt_regions.append((start, end)) - start = end = -1 - current_region = None - else: - pass - - if current_region is not None: - end = len(nodes) - 1 - ckpt_regions.append((start, end)) - return ckpt_regions - - -def emit_ckpt_func(body, - ckpt_func, - node_list: List[Node], - emit_node_func, - delete_unused_value_func, - level=0, - in_ckpt=False): - """Emit ckpt fuction in nested way - - Args: - body: forward code, in recursive calls, this part will be checkpoint - functions code - ckpt_func: checkpoint functions code, in recursive calls, this part - will be a buffer - node_list (List[Node]): list of torch.fx.Node - emit_node_func: function to emit a node - delete_unused_value_func: function to delete unused value - level (int, optional): checkpoint level. Defaults to 0. - in_ckpt (bool, optional): indicates wether the func is in recursive - call. Defaults to False. 
- """ - inputs, outputs = _find_input_and_output_nodes(node_list) - - # if the current checkpoint function use int as label, using old generation method - if isinstance(node_list[0].activation_checkpoint, int): - label = node_list[0].activation_checkpoint - ckpt_fn_def = _gen_ckpt_fn_def(label, inputs) - ckpt_func.append(f'{ckpt_fn_def}\n') - for node in node_list: - emit_node_func(node, ckpt_func) - ckpt_func[-1] = ' ' + ckpt_func[-1] - delete_unused_value_func(node, ckpt_func) - - ckpt_func.append(' ' + _gen_ckpt_output(outputs) + '\n\n') - activation_offload = getattr(node_list[0], "activation_offload", False) - usage = _gen_ckpt_usage(label, activation_offload, inputs, outputs, False) - usage += "\n" - body.append(usage) - - # use nested ckpt function codegen - else: - # label given by each layer, e.g. if you are currently at level [0, 1, 1] - # the label will be '0_1_1' - label = "_".join([str(idx) for idx in node_list[0].activation_checkpoint[:level + 1]]) - ckpt_fn_def = _gen_ckpt_fn_def(label, inputs) - ckpt_func.append(f'{ckpt_fn_def}\n') - - # if there is more level to fetch - if level + 1 < len(node_list[0].activation_checkpoint): - ckpt_regions = _find_nested_ckpt_regions(node_list, level + 1) - start_idx = [item[0] for item in ckpt_regions] - end_idx = [item[1] for item in ckpt_regions] - - # use ckpt_func_buffer to store nested checkpoint functions - ckpt_func_buffer = [] - node_idx = 0 - while 1: - if node_idx >= len(node_list): - break - - if node_idx in start_idx: - ckpt_node_list = node_list[node_idx:end_idx[start_idx.index(node_idx)] + 1] - emit_ckpt_func(ckpt_func, ckpt_func_buffer, ckpt_node_list, emit_node_func, - delete_unused_value_func, level + 1, True) - node_idx += len(ckpt_node_list) - - else: - node = node_list[node_idx] - emit_node_func(node, ckpt_func) - ckpt_func[-1] = ' ' + ckpt_func[-1] - delete_unused_value_func(node, ckpt_func) - node_idx += 1 - - ckpt_func.append(' ' + _gen_ckpt_output(outputs) + '\n\n') - ckpt_func += 
ckpt_func_buffer - activation_offload = getattr(node_list[0], "activation_offload", False) - usage = _gen_ckpt_usage(label, activation_offload, inputs, outputs, False) + '\n' - if in_ckpt: - usage = ' ' + usage - body.append(usage) - - # last level - else: - for node in node_list: - emit_node_func(node, ckpt_func) - ckpt_func[-1] = ' ' + ckpt_func[-1] - delete_unused_value_func(node, ckpt_func) - - ckpt_func.append(' ' + _gen_ckpt_output(outputs) + '\n\n') - activation_offload = getattr(node_list[0], "activation_offload", False) - usage = _gen_ckpt_usage(label, activation_offload, inputs, outputs, False) + '\n' - if in_ckpt: - usage = ' ' + usage - body.append(usage) def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_value_func, meta_nodes, meta_graph): From fad3b6d1a65ee04d18e4826045ce3af4e3d28f10 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Tue, 15 Nov 2022 10:46:51 +0800 Subject: [PATCH 018/209] polish code --- chunk_codegen.py | 478 +++++++++++++++++++++++------------------------ 1 file changed, 239 insertions(+), 239 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 9930a0570436..c1d9e26e790a 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -10,6 +10,13 @@ __all__ = ['ChunkCodeGen'] +def _delete_free_var_from_last_use(user_to_last_uses): + for key, value in user_to_last_uses.items(): + for n in value: + if n.op == 'placeholder': + user_to_last_uses[key].remove(n) + + class NodeIndexTracer(object): def __init__(self, gm) -> None: self.gm = gm @@ -19,7 +26,7 @@ def __init__(self, gm) -> None: self.idx_view_list = [] self.idx_count = -1 - def add_index(self): + def _add_index(self): """ Update the count and return it. To record the idx number. @@ -29,7 +36,7 @@ def add_index(self): self.idx_count += 1 return self.idx_count - def inherit_computation(self, node_from, node_to): + def _inherit_computation(self, node_from, node_to): """ Inherit computed dim from node_from to node_to. 
If a dim in node_from is marked as computed and exists in node_to, @@ -39,13 +46,13 @@ def inherit_computation(self, node_from, node_to): node_from (node): node to be inherited node_to (node): new node to inherit """ - _, compute_from = self.find_trace_from_node(node_from) - idx_to, compute_to = self.find_trace_from_node(node_to) + _, compute_from = self._find_trace_from_node(node_from) + idx_to, compute_to = self._find_trace_from_node(node_to) for i in compute_from: if i in idx_to and i not in compute_to: compute_to.append(i) - def mark_idx_equal(self, idx1, idx2): + def _mark_idx_equal(self, idx1, idx2): """ Mark 2 index to be equal. @@ -55,7 +62,7 @@ def mark_idx_equal(self, idx1, idx2): """ self.idx_trace_equal.append((idx1, idx2)) - def mark_computation(self, node, idx, dim): + def _mark_computation(self, node, idx, dim): """ Mark some dims of node as computed. @@ -64,7 +71,7 @@ def mark_computation(self, node, idx, dim): idx (int): node index dim (list or int): dims to be marked as computed """ - input_node_idx_trace = self.find_idx_trace_from_node(node) + input_node_idx_trace = self._find_idx_trace_from_node(node) if isinstance(dim, int): dim = [dim] for d in dim: @@ -72,7 +79,7 @@ def mark_computation(self, node, idx, dim): if cur_idx not in self.idx_trace_list[idx]['compute']: self.idx_trace_list[idx]['compute'].append(cur_idx) - def find_trace_from_node(self, node): + def _find_trace_from_node(self, node): """ Find node idx and compute trace by the node. @@ -86,7 +93,7 @@ def find_trace_from_node(self, node): node_dict = self.idx_trace_list[node_idx] return node_dict['idx'], node_dict['compute'] - def find_idx_trace_from_node(self, node): + def _find_idx_trace_from_node(self, node): """ Find node idx trace by the node. 
@@ -98,7 +105,7 @@ def find_idx_trace_from_node(self, node): node_idx = _find_idx_by_name(node.name, self.nodes_list) return self.idx_trace_list[node_idx]['idx'] - def find_compute_trace_from_node(self, node): + def _find_compute_trace_from_node(self, node): """ Find node compute trace by the node. @@ -110,7 +117,7 @@ def find_compute_trace_from_node(self, node): node_idx = _find_idx_by_name(node.name, self.nodes_list) return self.idx_trace_list[node_idx]['compute'] - def assign_index_as_input(self, node, node_idx): + def _assign_index_as_input(self, node, node_idx): """ Assign node's trace as its input node. @@ -124,7 +131,7 @@ def assign_index_as_input(self, node, node_idx): new_idx_trace = copy.deepcopy(input_node_idx_trace) self.idx_trace_list[node_idx]['idx'] = new_idx_trace - def assign_all_index(self, node, node_idx): + def _assign_all_index(self, node, node_idx): """ Add new index for all node's dims. @@ -135,10 +142,10 @@ def assign_all_index(self, node, node_idx): shape = node.meta['tensor_meta'].shape new_trace = [] for _ in shape: - new_trace.append(self.add_index()) + new_trace.append(self._add_index()) self.idx_trace_list[node_idx]['idx'] = new_trace - def assign_transpose_index(self, node, node_idx): + def _assign_transpose_index(self, node, node_idx): """ Assign index for transpose op. 1. 
swap input's dim according to transpose args @@ -149,16 +156,16 @@ def assign_transpose_index(self, node, node_idx): node_idx (int) """ tranpose_dim = node.args[1:] - input_node_idx_trace = self.find_idx_trace_from_node(node.args[0]) + input_node_idx_trace = self._find_idx_trace_from_node(node.args[0]) new_idx_trace = copy.deepcopy(input_node_idx_trace) new_idx_trace[tranpose_dim[0]] = input_node_idx_trace[tranpose_dim[1]] new_idx_trace[tranpose_dim[1]] = input_node_idx_trace[tranpose_dim[0]] self.idx_trace_list[node_idx]['idx'] = new_idx_trace - self.inherit_computation(node.args[0], node) + self._inherit_computation(node.args[0], node) - def assign_permute_index(self, node, node_idx): + def _assign_permute_index(self, node, node_idx): """ Assign index for permute op. 1. swap input's dim according to permute args @@ -169,16 +176,16 @@ def assign_permute_index(self, node, node_idx): node_idx (int) """ permute_dim = node.args[1:] - input_node_idx_trace = self.find_idx_trace_from_node(node.args[0]) + input_node_idx_trace = self._find_idx_trace_from_node(node.args[0]) new_idx_trace = copy.deepcopy(input_node_idx_trace) for idx, d in enumerate(permute_dim): new_idx_trace[idx] = input_node_idx_trace[d] self.idx_trace_list[node_idx]['idx'] = new_idx_trace - self.inherit_computation(node.args[0], node) + self._inherit_computation(node.args[0], node) - def assign_linear_index(self, node, node_idx): + def _assign_linear_index(self, node, node_idx): """ Assign index for linear op. 1. 
copy trace from input node and change last index accroding to weight @@ -190,22 +197,22 @@ def assign_linear_index(self, node, node_idx): node_idx (int) """ input_node, weight, bias = node.args - input_node_idx_trace = self.find_idx_trace_from_node(input_node) - weight_idx_trace = self.find_idx_trace_from_node(weight) + input_node_idx_trace = self._find_idx_trace_from_node(input_node) + weight_idx_trace = self._find_idx_trace_from_node(weight) new_idx_trace = copy.deepcopy(input_node_idx_trace) new_idx_trace[-1] = weight_idx_trace[1] self.idx_trace_list[node_idx]['idx'] = new_idx_trace - self.inherit_computation(input_node, node) - self.mark_computation(node, node_idx, [-1]) - self.mark_idx_equal(input_node_idx_trace[-1], weight_idx_trace[0]) + self._inherit_computation(input_node, node) + self._mark_computation(node, node_idx, [-1]) + self._mark_idx_equal(input_node_idx_trace[-1], weight_idx_trace[0]) if bias: - bias_idx_trace = self.find_idx_trace_from_node(bias) - self.mark_idx_equal(input_node_idx_trace[-1], bias_idx_trace[0]) + bias_idx_trace = self._find_idx_trace_from_node(bias) + self._mark_idx_equal(input_node_idx_trace[-1], bias_idx_trace[0]) - def assign_matmul_index(self, node, node_idx): + def _assign_matmul_index(self, node, node_idx): """ Assign index for matmul op. 1. copy trace from matmul_left and change last index accroding to matmul_right. 
(assert they have same length) @@ -217,20 +224,20 @@ def assign_matmul_index(self, node, node_idx): node_idx (int) """ matmul_left, matmul_right = node.args - matmul_left_idx_trace = self.find_idx_trace_from_node(matmul_left) - matmul_right_idx_trace = self.find_idx_trace_from_node(matmul_right) + matmul_left_idx_trace = self._find_idx_trace_from_node(matmul_left) + matmul_right_idx_trace = self._find_idx_trace_from_node(matmul_right) assert(len(matmul_left_idx_trace) == len(matmul_right_idx_trace)) new_idx_trace = copy.deepcopy(matmul_left_idx_trace) new_idx_trace[-1] = matmul_right_idx_trace[-1] self.idx_trace_list[node_idx]['idx'] = new_idx_trace - self.inherit_computation(matmul_left, node) - self.inherit_computation(matmul_right, node) - self.mark_computation(node, node_idx, [-1]) - self.mark_idx_equal(matmul_left_idx_trace[-1], matmul_right_idx_trace[-2]) + self._inherit_computation(matmul_left, node) + self._inherit_computation(matmul_right, node) + self._mark_computation(node, node_idx, [-1]) + self._mark_idx_equal(matmul_left_idx_trace[-1], matmul_right_idx_trace[-2]) - def assign_layernorm_index(self, node, idx): + def _assign_layernorm_index(self, node, idx): """ Assign index for layernorm op. 1. assign index as input node @@ -240,11 +247,11 @@ def assign_layernorm_index(self, node, idx): node (node) node_idx (int) """ - self.assign_index_as_input(node, idx) - self.inherit_computation(node.args[0], node) - self.mark_computation(node, idx, [-1, -2]) + self._assign_index_as_input(node, idx) + self._inherit_computation(node.args[0], node) + self._mark_computation(node, idx, [-1, -2]) - def assign_elementwise_index(self, node, idx): + def _assign_elementwise_index(self, node, idx): """ Assign index for element-wise op (eg. relu sigmoid add mul). 1. 
assign index as input node @@ -254,12 +261,12 @@ def assign_elementwise_index(self, node, idx): node (node) node_idx (int) """ - self.assign_index_as_input(node, idx) + self._assign_index_as_input(node, idx) for node_in in node.args: if type(node_in) not in (int, float): - self.inherit_computation(node_in, node) + self._inherit_computation(node_in, node) - def assign_softmax_index(self, node, idx): + def _assign_softmax_index(self, node, idx): """ Assign index for softmax op. 1. assign index as input node @@ -269,11 +276,11 @@ def assign_softmax_index(self, node, idx): node (node) node_idx (int) """ - self.assign_index_as_input(node, idx) - self.inherit_computation(node.args[0], node) - self.mark_computation(node, idx, [node.kwargs['dim']]) + self._assign_index_as_input(node, idx) + self._inherit_computation(node.args[0], node) + self._mark_computation(node, idx, [node.kwargs['dim']]) - def assign_view_reshape_index(self, node, node_idx): + def _assign_view_reshape_index(self, node, node_idx): """ Assign index for view and reshape op. 1. get origin shape and target shape by meta info. 
@@ -325,22 +332,22 @@ def assign_view_reshape_index(self, node, node_idx): raise NotImplementedError("shape" + str(origin_shape) + 'and' + str(target_shape) + "view not implemented") # get new index - origin_trace = self.find_idx_trace_from_node(origin_node) + origin_trace = self._find_idx_trace_from_node(origin_node) new_trace = copy.deepcopy(origin_trace) dim_from.reverse() for i in dim_from: new_trace.pop(i) for i in dim_to: - new_trace.insert(i, self.add_index()) + new_trace.insert(i, self._add_index()) self.idx_trace_list[node_idx]['idx'] = new_trace # inherit computation - self.inherit_computation(origin_node, node) - compute_log = self.find_compute_trace_from_node(origin_node) + self._inherit_computation(origin_node, node) + compute_log = self._find_compute_trace_from_node(origin_node) for i in dim_from: if origin_trace[i] in compute_log: for j in dim_to: - self.mark_computation(node, node_idx, [j]) + self._mark_computation(node, node_idx, [j]) break # log view, not used now @@ -353,25 +360,25 @@ def assign_view_reshape_index(self, node, node_idx): def trace_node_idx(self): for idx, node in enumerate(self.nodes_list): if node.op == 'placeholder': - self.assign_all_index(node, idx) + self._assign_all_index(node, idx) elif node.op == 'call_method': if 'transpose' in node.name: - self.assign_transpose_index(node, idx) + self._assign_transpose_index(node, idx) elif 'permute' in node.name: - self.assign_permute_index(node, idx) + self._assign_permute_index(node, idx) elif 'view' in node.name or 'reshape' in node.name: - self.assign_view_reshape_index(node, idx) + self._assign_view_reshape_index(node, idx) else: raise NotImplementedError(node.name, "method not implemented yet!") elif node.op == 'call_function': if 'linear' in node.name: - self.assign_linear_index(node, idx) + self._assign_linear_index(node, idx) elif 'matmul' in node.name: - self.assign_matmul_index(node, idx) + self._assign_matmul_index(node, idx) elif 'softmax' in node.name: - 
self.assign_softmax_index(node, idx) + self._assign_softmax_index(node, idx) elif any(n in node.name for n in ['mul', 'add', 'sigmoid', 'relu']): - self.assign_elementwise_index(node, idx) + self._assign_elementwise_index(node, idx) elif 'getattr' in node.name: continue # get attr like shape elif 'getitem' in node.name: @@ -380,206 +387,198 @@ def trace_node_idx(self): raise NotImplementedError(node.name, "function not implemented yet!") elif node.op == 'call_module': if any(n in node.name for n in ['layernorm', 'norm']): - self.assign_layernorm_index(node, idx) + self._assign_layernorm_index(node, idx) else: raise NotImplementedError(node.name, "module not implemented yet!") elif node.op == 'get_attr': - self.assign_all_index(node, idx) # get param + self._assign_all_index(node, idx) # get param elif node.op == 'output': continue else: raise NotImplementedError(node.op, "op not implemented yet!") -def _get_meta_node_size(x): - x = x.meta['tensor_meta'] - x = x.numel * torch.tensor([], dtype=x.dtype).element_size() - return x - +class MemoryEstimator(object): + def __init__(self) -> None: + pass -def _get_output_node_size(n): - fwd_out = {x.uuid: x for x in n.meta["fwd_out"] if isinstance(x, torch.Tensor) and hasattr(x, 'uuid')} - return activation_size(fwd_out) + def _get_meta_node_size(self, x): + x = x.meta['tensor_meta'] + x = x.numel * torch.tensor([], dtype=x.dtype).element_size() + return x + def _get_output_node_size(self, n): + fwd_out = {x.uuid: x for x in n.meta["fwd_out"] if isinstance(x, torch.Tensor) and hasattr(x, 'uuid')} + return activation_size(fwd_out) -def _get_delete_node_size(user, user_to_last_uses): - if user.op in ('placeholder', 'output'): + def _get_delete_node_size(self, user, user_to_last_uses): + if user.op in ('placeholder', 'output'): + return 0 + nodes_to_delete = user_to_last_uses.get(user, []) + if len(nodes_to_delete): + delete_size = sum([self._get_output_node_size(i) for i in nodes_to_delete]) + return delete_size return 0 - 
nodes_to_delete = user_to_last_uses.get(user, []) - if len(nodes_to_delete): - delete_size = sum([_get_output_node_size(i) for i in nodes_to_delete]) - return delete_size - return 0 - - -def _get_last_usr(nodes): - node_to_last_use: Dict[Node, Node] = {} - user_to_last_uses: Dict[Node, List[Node]] = {} - - def register_last_uses(n: Node, user: Node): - if n not in node_to_last_use: - node_to_last_use[n] = user - user_to_last_uses.setdefault(user, []).append(n) - - for node in reversed(nodes): - map_arg(node.args, lambda n: register_last_uses(n, node)) - map_arg(node.kwargs, lambda n: register_last_uses(n, node)) - return user_to_last_uses - - -def _delete_free_var_from_last_use(user_to_last_uses): - for key, value in user_to_last_uses.items(): - for n in value: - if n.op == 'placeholder': - user_to_last_uses[key].remove(n) - - -def _get_contiguous_memory(node, not_contiguous_list, delete=False): - mem = 0 - not_contiguous_ops = ['transpose', 'permute'] - - if node.op == 'call_function' and 'matmul' in node.name: - for n in node.args: - if n in not_contiguous_list: - # matmul won't change origin tensor, but create a tmp copy - mem += _get_output_node_size(n) - elif node.op == 'call_module': - for n in node.args: - if n in not_contiguous_list: - # module will just make origin tensor to contiguous - if delete: - not_contiguous_list.remove(n) - elif node.op == 'call_method' and any(i in node.name for i in not_contiguous_ops): - if node not in not_contiguous_list: - not_contiguous_list.append(node) - elif any(i in node.args for i in not_contiguous_list): - if node not in not_contiguous_list: - not_contiguous_list.append(node) - - return mem - - -def _estimate_inference_mem(gm: torch.fx.GraphModule): - act_memory = 0.0 - act_memory_peak_log = [] - act_memory_after_node_log = [] - not_contiguous_list = [] - user_to_last_uses = _get_last_usr(list(gm.graph.nodes)) - _delete_free_var_from_last_use(user_to_last_uses) - for node in gm.graph.nodes: - # if node is placeholder, 
just add the size of the node - if node.op == 'placeholder': - act_memory += _get_meta_node_size(node) / (1024 ** 2) - act_memory_peak_log.append(act_memory) - act_memory_after_node_log.append(act_memory) - # skip output - elif node.op == 'output': - continue - # node is an operation, calculate tmp, output node and delete node memory - else: - # forward memory - act_memory += _get_contiguous_memory(node, not_contiguous_list) / (1024 ** 2) - act_memory += _get_output_node_size(node) / (1024 ** 2) - # record max act memory - act_memory_peak_log.append(act_memory) - # delete useless memory - act_memory -= _get_delete_node_size(node, user_to_last_uses) / (1024 ** 2) - act_memory -= _get_contiguous_memory(node, not_contiguous_list, delete=True) / (1024 ** 2) - act_memory_after_node_log.append(act_memory) - print("no chunk") - _print_mem_log(act_memory_peak_log, list(gm.graph.nodes), "peak") - _print_mem_log(act_memory_after_node_log, list(gm.graph.nodes), "after") - - param_memory = parameter_size(gm) - return act_memory + param_memory, param_memory + def _get_last_usr(self, nodes): + node_to_last_use: Dict[Node, Node] = {} + user_to_last_uses: Dict[Node, List[Node]] = {} + + def register_last_uses(n: Node, user: Node): + if n not in node_to_last_use: + node_to_last_use[n] = user + user_to_last_uses.setdefault(user, []).append(n) + + for node in reversed(nodes): + map_arg(node.args, lambda n: register_last_uses(n, node)) + map_arg(node.kwargs, lambda n: register_last_uses(n, node)) + return user_to_last_uses + + def _get_contiguous_memory(self, node, not_contiguous_list, delete=False): + mem = 0 + not_contiguous_ops = ['transpose', 'permute'] + + if node.op == 'call_function' and 'matmul' in node.name: + for n in node.args: + if n in not_contiguous_list: + # matmul won't change origin tensor, but create a tmp copy + mem += self._get_output_node_size(n) + elif node.op == 'call_module': + for n in node.args: + if n in not_contiguous_list: + # module will just make origin 
tensor to contiguous + if delete: + not_contiguous_list.remove(n) + elif node.op == 'call_method' and any(i in node.name for i in not_contiguous_ops): + if node not in not_contiguous_list: + not_contiguous_list.append(node) + elif any(i in node.args for i in not_contiguous_list): + if node not in not_contiguous_list: + not_contiguous_list.append(node) + + return mem + + def estimate_inference_mem(self, gm: torch.fx.GraphModule): + act_memory = 0.0 + act_memory_peak_log = [] + act_memory_after_node_log = [] + not_contiguous_list = [] + user_to_last_uses = self._get_last_usr(list(gm.graph.nodes)) + _delete_free_var_from_last_use(user_to_last_uses) + for node in gm.graph.nodes: + # if node is placeholder, just add the size of the node + if node.op == 'placeholder': + act_memory += self._get_meta_node_size(node) / (1024 ** 2) + act_memory_peak_log.append(act_memory) + act_memory_after_node_log.append(act_memory) + # skip output + elif node.op == 'output': + continue + # node is an operation, calculate tmp, output node and delete node memory + else: + # forward memory + act_memory += self._get_contiguous_memory(node, not_contiguous_list) / (1024 ** 2) + act_memory += self._get_output_node_size(node) / (1024 ** 2) + # record max act memory + act_memory_peak_log.append(act_memory) + # delete useless memory + act_memory -= self._get_delete_node_size(node, user_to_last_uses) / (1024 ** 2) + act_memory -= self._get_contiguous_memory(node, not_contiguous_list, delete=True) / (1024 ** 2) + act_memory_after_node_log.append(act_memory) + + print("no chunk") + self._print_mem_log(act_memory_peak_log, list(gm.graph.nodes), "peak") + self._print_mem_log(act_memory_after_node_log, list(gm.graph.nodes), "after") + + param_memory = parameter_size(gm) + return act_memory + param_memory, param_memory -def _get_chunk_ratio(node, chunk_dim, chunk_size): - shape = node.meta['tensor_meta'].shape - chunk_ratio = float(chunk_size) / shape[chunk_dim] - return chunk_ratio + def 
_get_chunk_ratio(self, node, chunk_dim, chunk_size): + shape = node.meta['tensor_meta'].shape + chunk_ratio = float(chunk_size) / shape[chunk_dim] + return chunk_ratio + + + def _get_chunk_delete_node_size(self, user, user_to_last_uses, chunk_ratio, node_list, start_node, end_node): + if user.op in ('placeholder', 'output'): + return 0 + nodes_to_delete = user_to_last_uses.get(user, []) + delete_size = 0 + for n in nodes_to_delete: + node_idx = _find_idx_by_name(n.name, node_list) + if start_node <= node_idx < end_node: + delete_size += self._get_output_node_size(n) * chunk_ratio + return delete_size -def _get_chunk_delete_node_size(user, user_to_last_uses, chunk_ratio, node_list, start_node, end_node): - if user.op in ('placeholder', 'output'): - return 0 - nodes_to_delete = user_to_last_uses.get(user, []) - delete_size = 0 - for n in nodes_to_delete: - node_idx = _find_idx_by_name(n.name, node_list) - if start_node <= node_idx < end_node: - delete_size += _get_output_node_size(n) * chunk_ratio - return delete_size - - -def _print_mem_log(log, nodes, title=None): - if title: - print(title) - for idx, (l, n) in enumerate(zip(log, nodes)): - print("%s:%.2f \t" % (n.name, l), end='') - if (idx + 1) % 3 == 0: - print("") - print("\n") - - -def _estimate_chunk_inference_mem(gm: torch.fx.GraphModule, start_nodes, end_nodes, chunk_dims, chunk_sizes): - act_memory = 0.0 - act_memory_peak_log = [] - act_memory_after_node_log = [] - not_contiguous_list = [] - user_to_last_uses = _get_last_usr(list(gm.graph.nodes)) - _delete_free_var_from_last_use(user_to_last_uses) - within_chunk = False - region_idx = 0 - chunk_ratio = 1 # use it to estimate chunk mem - node_list = list(gm.graph.nodes) - - for idx, node in enumerate(node_list): - # if node in chunk start nodes, change chunk ratio and add chunk_tensor - if idx in start_nodes: - within_chunk = True - chunk_ratio = _get_chunk_ratio(node, chunk_dims[region_idx], chunk_sizes[region_idx]) - act_memory += 
_get_output_node_size(node_list[end_nodes[region_idx]]) / (1024 ** 2) - - # if node is placeholder, just add the size of the node - if node.op == 'placeholder': - act_memory += _get_meta_node_size(node) * chunk_ratio / (1024 ** 2) - act_memory_peak_log.append(act_memory) - # skip output - elif node.op == 'output': - continue - # node is an operation, calculate tmp, output node and delete node memory - else: - # forward memory - # TODO: permute will create a tmp copy if not contiguous - act_memory += _get_contiguous_memory(node, not_contiguous_list) * chunk_ratio / (1024 ** 2) - act_memory += _get_output_node_size(node) * chunk_ratio / (1024 ** 2) - # record max act memory - act_memory_peak_log.append(act_memory) - # delete useless memory - act_memory -= _get_contiguous_memory(node, not_contiguous_list, delete=True) * chunk_ratio / (1024 ** 2) - if within_chunk: - act_memory -= _get_chunk_delete_node_size( - node, user_to_last_uses, chunk_ratio, node_list, - start_nodes[region_idx], end_nodes[region_idx]) / (1024 ** 2) + def _print_mem_log(self, log, nodes, title=None): + if title: + print(title) + for idx, (l, n) in enumerate(zip(log, nodes)): + print("%s:%.2f \t" % (n.name, l), end='') + if (idx + 1) % 3 == 0: + print("") + print("\n") + + + def estimate_chunk_inference_mem(self, gm: torch.fx.GraphModule, start_nodes, end_nodes, chunk_dims, chunk_sizes): + act_memory = 0.0 + act_memory_peak_log = [] + act_memory_after_node_log = [] + not_contiguous_list = [] + user_to_last_uses = self._get_last_usr(list(gm.graph.nodes)) + _delete_free_var_from_last_use(user_to_last_uses) + within_chunk = False + region_idx = 0 + chunk_ratio = 1 # use it to estimate chunk mem + node_list = list(gm.graph.nodes) + + for idx, node in enumerate(node_list): + # if node in chunk start nodes, change chunk ratio and add chunk_tensor + if idx in start_nodes: + within_chunk = True + chunk_ratio = self._get_chunk_ratio(node, chunk_dims[region_idx], chunk_sizes[region_idx]) + act_memory += 
self._get_output_node_size(node_list[end_nodes[region_idx]]) / (1024 ** 2) + + # if node is placeholder, just add the size of the node + if node.op == 'placeholder': + act_memory += self._get_meta_node_size(node) * chunk_ratio / (1024 ** 2) + act_memory_peak_log.append(act_memory) + # skip output + elif node.op == 'output': + continue + # node is an operation, calculate tmp, output node and delete node memory else: - act_memory -= _get_delete_node_size(node, user_to_last_uses) / (1024 ** 2) + # forward memory + # TODO: permute will create a tmp copy if not contiguous + act_memory += self._get_contiguous_memory(node, not_contiguous_list) * chunk_ratio / (1024 ** 2) + act_memory += self._get_output_node_size(node) * chunk_ratio / (1024 ** 2) + # record max act memory + act_memory_peak_log.append(act_memory) + # delete useless memory + act_memory -= self._get_contiguous_memory(node, not_contiguous_list, delete=True) * chunk_ratio / (1024 ** 2) + if within_chunk: + act_memory -= self._get_chunk_delete_node_size( + node, user_to_last_uses, chunk_ratio, node_list, + start_nodes[region_idx], end_nodes[region_idx]) / (1024 ** 2) + else: + act_memory -= self._get_delete_node_size(node, user_to_last_uses) / (1024 ** 2) + + if idx in end_nodes: + act_memory -= self._get_output_node_size(node) * chunk_ratio / (1024 ** 2) + within_chunk = False + chunk_ratio = 1 + region_idx += 1 - if idx in end_nodes: - act_memory -= _get_output_node_size(node) * chunk_ratio / (1024 ** 2) - within_chunk = False - chunk_ratio = 1 - region_idx += 1 - - act_memory_after_node_log.append(act_memory) + act_memory_after_node_log.append(act_memory) - print("chunk") - _print_mem_log(act_memory_peak_log, node_list, "peak") - _print_mem_log(act_memory_after_node_log, node_list, "after") + print("chunk") + self._print_mem_log(act_memory_peak_log, node_list, "peak") + self._print_mem_log(act_memory_after_node_log, node_list, "after") - param_memory = parameter_size(gm) - return act_memory + param_memory, 
param_memory + param_memory = parameter_size(gm) + return act_memory + param_memory, param_memory def _gen_chunk_slice_dim(chunk_dim, chunk_idx_name, shape): @@ -695,8 +694,9 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v within_chunk_region = False node_list = list(nodes) - _estimate_chunk_inference_mem(meta_graph, chunk_starts, chunk_ends, [1], [2]) - _estimate_inference_mem(meta_graph) + memory_estimator = MemoryEstimator() + memory_estimator.estimate_chunk_inference_mem(meta_graph, chunk_starts, chunk_ends, [1], [2]) + memory_estimator.estimate_inference_mem(meta_graph) node_index_tracer = NodeIndexTracer(meta_graph) node_index_tracer.trace_node_idx() From 54a34a7e46d2f9e0234eb9295f3507e720ba21b2 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Tue, 15 Nov 2022 11:30:43 +0800 Subject: [PATCH 019/209] update active log --- chunk_codegen.py | 56 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 43 insertions(+), 13 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index c1d9e26e790a..ade986d1e343 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -407,18 +407,41 @@ def _get_meta_node_size(self, x): x = x.numel * torch.tensor([], dtype=x.dtype).element_size() return x - def _get_output_node_size(self, n): + def _get_output_node(self, n): fwd_out = {x.uuid: x for x in n.meta["fwd_out"] if isinstance(x, torch.Tensor) and hasattr(x, 'uuid')} - return activation_size(fwd_out) + out_size = activation_size(fwd_out) + out_node = [n.name] if out_size > 0 else [] + return out_size, out_node + + def _get_output_node_size(self, n): + return self._get_output_node(n)[0] + + def _add_active_node(self, n, active_list): + new_active = self._get_output_node(n)[1] + for i in new_active: + if i not in active_list: + active_list.append(i) + def _get_delete_node(self, user, user_to_last_uses): + delete_size = 0 + delete_node = [] + if user.op not in ('placeholder', 'output'): + nodes_to_delete = 
user_to_last_uses.get(user, []) + if len(nodes_to_delete): + out_node = [self._get_output_node(i) for i in nodes_to_delete] + delete_size = sum([i[0] for i in out_node]) + for i in range(len(out_node)): + if out_node[i][0] > 0: + delete_node.append(out_node[i][1][0]) + return delete_size, delete_node + def _get_delete_node_size(self, user, user_to_last_uses): - if user.op in ('placeholder', 'output'): - return 0 - nodes_to_delete = user_to_last_uses.get(user, []) - if len(nodes_to_delete): - delete_size = sum([self._get_output_node_size(i) for i in nodes_to_delete]) - return delete_size - return 0 + return self._get_delete_node(user, user_to_last_uses)[0] + + def _remove_active_node(self, user, user_to_last_uses, active_list): + delete_node = self._get_delete_node(user, user_to_last_uses)[1] + for i in delete_node: + active_list.remove(i) def _get_last_usr(self, nodes): node_to_last_use: Dict[Node, Node] = {} @@ -438,7 +461,7 @@ def _get_contiguous_memory(self, node, not_contiguous_list, delete=False): mem = 0 not_contiguous_ops = ['transpose', 'permute'] - if node.op == 'call_function' and 'matmul' in node.name: + if node.op == 'call_function' and any(n in node.name for n in ['matmul', 'reshape']): for n in node.args: if n in not_contiguous_list: # matmul won't change origin tensor, but create a tmp copy @@ -463,6 +486,8 @@ def estimate_inference_mem(self, gm: torch.fx.GraphModule): act_memory_peak_log = [] act_memory_after_node_log = [] not_contiguous_list = [] + active_node_list = [] + active_node_list_log = [] user_to_last_uses = self._get_last_usr(list(gm.graph.nodes)) _delete_free_var_from_last_use(user_to_last_uses) for node in gm.graph.nodes: @@ -470,7 +495,7 @@ def estimate_inference_mem(self, gm: torch.fx.GraphModule): if node.op == 'placeholder': act_memory += self._get_meta_node_size(node) / (1024 ** 2) act_memory_peak_log.append(act_memory) - act_memory_after_node_log.append(act_memory) + active_node_list.append(node.name) # skip output elif node.op == 
'output': continue @@ -484,8 +509,12 @@ def estimate_inference_mem(self, gm: torch.fx.GraphModule): # delete useless memory act_memory -= self._get_delete_node_size(node, user_to_last_uses) / (1024 ** 2) act_memory -= self._get_contiguous_memory(node, not_contiguous_list, delete=True) / (1024 ** 2) - act_memory_after_node_log.append(act_memory) + # log active node + self._add_active_node(node, active_node_list) + self._remove_active_node(node, user_to_last_uses, active_node_list) + act_memory_after_node_log.append(act_memory) + active_node_list_log.append(copy.deepcopy(active_node_list)) print("no chunk") self._print_mem_log(act_memory_peak_log, list(gm.graph.nodes), "peak") self._print_mem_log(act_memory_after_node_log, list(gm.graph.nodes), "after") @@ -551,7 +580,6 @@ def estimate_chunk_inference_mem(self, gm: torch.fx.GraphModule, start_nodes, en # node is an operation, calculate tmp, output node and delete node memory else: # forward memory - # TODO: permute will create a tmp copy if not contiguous act_memory += self._get_contiguous_memory(node, not_contiguous_list) * chunk_ratio / (1024 ** 2) act_memory += self._get_output_node_size(node) * chunk_ratio / (1024 ** 2) # record max act memory @@ -694,9 +722,11 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v within_chunk_region = False node_list = list(nodes) + memory_estimator = MemoryEstimator() memory_estimator.estimate_chunk_inference_mem(meta_graph, chunk_starts, chunk_ends, [1], [2]) memory_estimator.estimate_inference_mem(meta_graph) + node_index_tracer = NodeIndexTracer(meta_graph) node_index_tracer.trace_node_idx() From d9ca2f898d1fb2a2b76ba663ebb27b9a778bd0ed Mon Sep 17 00:00:00 2001 From: oahzxl Date: Tue, 15 Nov 2022 15:50:50 +0800 Subject: [PATCH 020/209] polish code --- chunk_codegen.py | 87 +++++++++++++++--------------------------------- 1 file changed, 27 insertions(+), 60 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 
ade986d1e343..77aca8deb81f 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -438,7 +438,7 @@ def _get_delete_node(self, user, user_to_last_uses): def _get_delete_node_size(self, user, user_to_last_uses): return self._get_delete_node(user, user_to_last_uses)[0] - def _remove_active_node(self, user, user_to_last_uses, active_list): + def _remove_deactive_node(self, user, user_to_last_uses, active_list): delete_node = self._get_delete_node(user, user_to_last_uses)[1] for i in delete_node: active_list.remove(i) @@ -481,48 +481,6 @@ def _get_contiguous_memory(self, node, not_contiguous_list, delete=False): return mem - def estimate_inference_mem(self, gm: torch.fx.GraphModule): - act_memory = 0.0 - act_memory_peak_log = [] - act_memory_after_node_log = [] - not_contiguous_list = [] - active_node_list = [] - active_node_list_log = [] - user_to_last_uses = self._get_last_usr(list(gm.graph.nodes)) - _delete_free_var_from_last_use(user_to_last_uses) - for node in gm.graph.nodes: - # if node is placeholder, just add the size of the node - if node.op == 'placeholder': - act_memory += self._get_meta_node_size(node) / (1024 ** 2) - act_memory_peak_log.append(act_memory) - active_node_list.append(node.name) - # skip output - elif node.op == 'output': - continue - # node is an operation, calculate tmp, output node and delete node memory - else: - # forward memory - act_memory += self._get_contiguous_memory(node, not_contiguous_list) / (1024 ** 2) - act_memory += self._get_output_node_size(node) / (1024 ** 2) - # record max act memory - act_memory_peak_log.append(act_memory) - # delete useless memory - act_memory -= self._get_delete_node_size(node, user_to_last_uses) / (1024 ** 2) - act_memory -= self._get_contiguous_memory(node, not_contiguous_list, delete=True) / (1024 ** 2) - # log active node - self._add_active_node(node, active_node_list) - self._remove_active_node(node, user_to_last_uses, active_node_list) - - act_memory_after_node_log.append(act_memory) - 
active_node_list_log.append(copy.deepcopy(active_node_list)) - print("no chunk") - self._print_mem_log(act_memory_peak_log, list(gm.graph.nodes), "peak") - self._print_mem_log(act_memory_after_node_log, list(gm.graph.nodes), "after") - - param_memory = parameter_size(gm) - return act_memory + param_memory, param_memory - - def _get_chunk_ratio(self, node, chunk_dim, chunk_size): shape = node.meta['tensor_meta'].shape chunk_ratio = float(chunk_size) / shape[chunk_dim] @@ -550,25 +508,28 @@ def _print_mem_log(self, log, nodes, title=None): print("") print("\n") - - def estimate_chunk_inference_mem(self, gm: torch.fx.GraphModule, start_nodes, end_nodes, chunk_dims, chunk_sizes): + def estimate_chunk_inference_mem(self, gm: torch.fx.GraphModule, start_nodes=None, end_nodes=None, chunk_dims=None, chunk_sizes=None): act_memory = 0.0 act_memory_peak_log = [] act_memory_after_node_log = [] + active_node_list = [] + active_node_list_log = [] not_contiguous_list = [] + node_list = list(gm.graph.nodes) user_to_last_uses = self._get_last_usr(list(gm.graph.nodes)) _delete_free_var_from_last_use(user_to_last_uses) - within_chunk = False - region_idx = 0 + + use_chunk = all(i is not None for i in [start_nodes, end_nodes, chunk_dims, chunk_sizes]) + chunk_within = False + chunk_region_idx = 0 chunk_ratio = 1 # use it to estimate chunk mem - node_list = list(gm.graph.nodes) for idx, node in enumerate(node_list): # if node in chunk start nodes, change chunk ratio and add chunk_tensor - if idx in start_nodes: - within_chunk = True - chunk_ratio = self._get_chunk_ratio(node, chunk_dims[region_idx], chunk_sizes[region_idx]) - act_memory += self._get_output_node_size(node_list[end_nodes[region_idx]]) / (1024 ** 2) + if use_chunk and idx in start_nodes: + chunk_within = True + chunk_ratio = self._get_chunk_ratio(node, chunk_dims[chunk_region_idx], chunk_sizes[chunk_region_idx]) + act_memory += self._get_output_node_size(node_list[end_nodes[chunk_region_idx]]) / (1024 ** 2) # if node is 
placeholder, just add the size of the node if node.op == 'placeholder': @@ -586,22 +547,28 @@ def estimate_chunk_inference_mem(self, gm: torch.fx.GraphModule, start_nodes, en act_memory_peak_log.append(act_memory) # delete useless memory act_memory -= self._get_contiguous_memory(node, not_contiguous_list, delete=True) * chunk_ratio / (1024 ** 2) - if within_chunk: + if chunk_within: act_memory -= self._get_chunk_delete_node_size( node, user_to_last_uses, chunk_ratio, node_list, - start_nodes[region_idx], end_nodes[region_idx]) / (1024 ** 2) + start_nodes[chunk_region_idx], end_nodes[chunk_region_idx]) / (1024 ** 2) else: act_memory -= self._get_delete_node_size(node, user_to_last_uses) / (1024 ** 2) - - if idx in end_nodes: + + # log active node + self._add_active_node(node, active_node_list) + self._remove_deactive_node(node, user_to_last_uses, active_node_list) + + # if node in chunk end nodes, restore chunk settings + if use_chunk and idx in end_nodes: act_memory -= self._get_output_node_size(node) * chunk_ratio / (1024 ** 2) - within_chunk = False + chunk_within = False chunk_ratio = 1 - region_idx += 1 + chunk_region_idx += 1 act_memory_after_node_log.append(act_memory) + active_node_list_log.append(copy.deepcopy(active_node_list)) - print("chunk") + print("with chunk" if use_chunk else "without chunk") self._print_mem_log(act_memory_peak_log, node_list, "peak") self._print_mem_log(act_memory_after_node_log, node_list, "after") @@ -725,7 +692,7 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v memory_estimator = MemoryEstimator() memory_estimator.estimate_chunk_inference_mem(meta_graph, chunk_starts, chunk_ends, [1], [2]) - memory_estimator.estimate_inference_mem(meta_graph) + memory_estimator.estimate_chunk_inference_mem(meta_graph) node_index_tracer = NodeIndexTracer(meta_graph) node_index_tracer.trace_node_idx() From 7330d907459a220ebedaeafbbcc7c3cff3c8b1c4 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Sun, 4 Dec 2022 
17:05:28 +0800 Subject: [PATCH 021/209] add possible region search --- chunk_codegen.py | 116 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 109 insertions(+), 7 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 77aca8deb81f..ba83f7fec3be 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -356,7 +356,17 @@ def _assign_view_reshape_index(self, node, node_idx): "idx_to": [new_trace[i] for i in dim_to], "dim_to": dim_to} self.idx_view_list.append(view_dict) - + + def _merge_equal_idx(self): + idx_equal = copy.deepcopy(self.idx_trace_equal) + idx_equal.reverse() + for idx in idx_equal: + merge_to = min(idx) + merge_from = max(idx) + for trace in self.idx_trace_list: + if merge_from in trace['idx']: + trace['idx'] = [merge_to if i == merge_from else i for i in trace['idx']] + def trace_node_idx(self): for idx, node in enumerate(self.nodes_list): if node.op == 'placeholder': @@ -396,6 +406,7 @@ def trace_node_idx(self): continue else: raise NotImplementedError(node.op, "op not implemented yet!") + self._merge_equal_idx() class MemoryEstimator(object): @@ -433,6 +444,8 @@ def _get_delete_node(self, user, user_to_last_uses): for i in range(len(out_node)): if out_node[i][0] > 0: delete_node.append(out_node[i][1][0]) + elif nodes_to_delete[i].op == 'placeholder': + delete_node.append(nodes_to_delete[i].name) return delete_size, delete_node def _get_delete_node_size(self, user, user_to_last_uses): @@ -516,8 +529,9 @@ def estimate_chunk_inference_mem(self, gm: torch.fx.GraphModule, start_nodes=Non active_node_list_log = [] not_contiguous_list = [] node_list = list(gm.graph.nodes) - user_to_last_uses = self._get_last_usr(list(gm.graph.nodes)) - _delete_free_var_from_last_use(user_to_last_uses) + user_to_last_uses = self._get_last_usr(node_list) + user_to_last_uses_no_free_var = self._get_last_usr(node_list) + _delete_free_var_from_last_use(user_to_last_uses_no_free_var) use_chunk = all(i is not None for i in [start_nodes, end_nodes, 
chunk_dims, chunk_sizes]) chunk_within = False @@ -535,6 +549,7 @@ def estimate_chunk_inference_mem(self, gm: torch.fx.GraphModule, start_nodes=Non if node.op == 'placeholder': act_memory += self._get_meta_node_size(node) * chunk_ratio / (1024 ** 2) act_memory_peak_log.append(act_memory) + active_node_list.append(node.name) # skip output elif node.op == 'output': continue @@ -549,10 +564,10 @@ def estimate_chunk_inference_mem(self, gm: torch.fx.GraphModule, start_nodes=Non act_memory -= self._get_contiguous_memory(node, not_contiguous_list, delete=True) * chunk_ratio / (1024 ** 2) if chunk_within: act_memory -= self._get_chunk_delete_node_size( - node, user_to_last_uses, chunk_ratio, node_list, + node, user_to_last_uses_no_free_var, chunk_ratio, node_list, start_nodes[chunk_region_idx], end_nodes[chunk_region_idx]) / (1024 ** 2) else: - act_memory -= self._get_delete_node_size(node, user_to_last_uses) / (1024 ** 2) + act_memory -= self._get_delete_node_size(node, user_to_last_uses_no_free_var) / (1024 ** 2) # log active node self._add_active_node(node, active_node_list) @@ -572,8 +587,92 @@ def estimate_chunk_inference_mem(self, gm: torch.fx.GraphModule, start_nodes=Non self._print_mem_log(act_memory_peak_log, node_list, "peak") self._print_mem_log(act_memory_after_node_log, node_list, "after") - param_memory = parameter_size(gm) - return act_memory + param_memory, param_memory + # param_memory = parameter_size(gm) + # all_memory = act_memory + param_memory + return act_memory_peak_log, act_memory_after_node_log, active_node_list_log + + +class ChunkRegionSearch(object): + def __init__(self, gm) -> None: + self.gm = gm + self.node_list = list(gm.graph.nodes) + self.memory_estimator = MemoryEstimator() + self.index_tracer = NodeIndexTracer(gm) + self.index_tracer.trace_node_idx() + + def _find_peak_node(self, mem_peak): + max_value = max(mem_peak) + max_idx = [mem_peak.index(max_value)] + return max_idx + + def _get_free_var(self): + free_var_idx = [] + for idx, n 
in enumerate(self.node_list): + if n.op == 'placeholder': + free_var_idx.append(idx) + return free_var_idx + + def _get_min_free_var(self, active_node_list, free_vars): + min_len = 999 + for idx, n in enumerate(active_node_list): + if idx in free_vars: + continue + if len(n) < min_len: + min_len = len(n) + return min_len + + def _search_max_chunk_region(self, active_node, peak_node): + free_vars = self._get_free_var() + min_var = self._get_min_free_var(active_node, free_vars) + + # from peak_node to free_var + chunk_region_start = None + for i in range(peak_node, -1, -1): + if len(active_node[i]) == min_var: + chunk_region_start = i + 1 + break + if i in free_vars or i == 0: + raise RuntimeError() + # from peak_node to len-2 + chunk_region_end = None + for i in range(peak_node, len(active_node) - 1): + if len(active_node[i]) == min_var: + chunk_region_end = i - 1 + break + if i in free_vars or i == 0: + raise RuntimeError() + return chunk_region_start, chunk_region_end + + def _search_possible_chunk_regions(self, max_chunk_region, peak_node): + possible_chunk_region = [] + for before_idx in range(max_chunk_region[0], peak_node): + for after_idx in range(peak_node, max_chunk_region[1]): + # skip non compute nodes + if any(op in ['placeholder', 'get_attr', 'output'] for op in + [self.node_list[before_idx].op, self.node_list[after_idx].op]): + continue + if any(any(i in name for i in ['getitem', 'getattr']) for name in + [self.node_list[before_idx].name, self.node_list[after_idx].name]): + continue + + # select free dim + before_trace = self.index_tracer.idx_trace_list[before_idx] + after_trace = self.index_tracer.idx_trace_list[after_idx] + free_dim = [] + for i in range(min(len(before_trace['idx']), len(after_trace['idx']))): + if (before_trace['idx'][i] == after_trace['idx'][i] and + before_trace['idx'][i] not in before_trace['compute'] and + after_trace['idx'][i] not in after_trace['compute']): + free_dim.append(i) + possible_chunk_region.append({'region': 
(before_idx, after_idx), 'dim': free_dim}) + return possible_chunk_region + + def search_region(self): + mem_peak, mem_after, active_node = self.memory_estimator.estimate_chunk_inference_mem(self.gm) + peak_nodes = self._find_peak_node(mem_peak) + for idx, peak_node in enumerate(peak_nodes): + max_chunk_region = self._search_max_chunk_region(active_node, peak_node) + possible_chunk_regions = self._search_possible_chunk_regions(max_chunk_region, peak_node) def _gen_chunk_slice_dim(chunk_dim, chunk_idx_name, shape): @@ -696,6 +795,9 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v node_index_tracer = NodeIndexTracer(meta_graph) node_index_tracer.trace_node_idx() + + chunk_region_search = ChunkRegionSearch(meta_graph) + chunk_region_search.search_region() # find the input and output var names for each offload region for idx, (start, end) in enumerate(chunk_regions): From 3b7d6712065b65d9c93feb64a488739e4483981f Mon Sep 17 00:00:00 2001 From: oahzxl Date: Tue, 6 Dec 2022 11:08:39 +0800 Subject: [PATCH 022/209] finish region search loop --- chunk_codegen.py | 152 ++++++++++++++++++++++++++++++++----------- chunk_codegen_run.py | 4 +- 2 files changed, 116 insertions(+), 40 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index ba83f7fec3be..47cda0f8ed20 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -21,7 +21,7 @@ class NodeIndexTracer(object): def __init__(self, gm) -> None: self.gm = gm self.nodes_list = list(gm.graph.nodes) - self.idx_trace_list = [{'idx': [], 'compute': []} for _ in range(len(self.nodes_list))] + self.idx_trace_list = [{'idx': [], 'compute': {}} for _ in range(len(self.nodes_list))] self.idx_trace_equal = [] self.idx_view_list = [] self.idx_count = -1 @@ -48,9 +48,12 @@ def _inherit_computation(self, node_from, node_to): """ _, compute_from = self._find_trace_from_node(node_from) idx_to, compute_to = self._find_trace_from_node(node_to) - for i in compute_from: - if i in idx_to and i not in 
compute_to: - compute_to.append(i) + for k, v in compute_from.items(): + if k in idx_to: + if k in compute_to: + compute_to[k].extend(v) + else: + compute_to[k] = copy.deepcopy(v) def _mark_idx_equal(self, idx1, idx2): """ @@ -77,7 +80,9 @@ def _mark_computation(self, node, idx, dim): for d in dim: cur_idx = input_node_idx_trace[d] if cur_idx not in self.idx_trace_list[idx]['compute']: - self.idx_trace_list[idx]['compute'].append(cur_idx) + self.idx_trace_list[idx]['compute'][cur_idx] = [idx] + else: + self.idx_trace_list[idx]['compute'][cur_idx].append(idx) def _find_trace_from_node(self, node): """ @@ -357,6 +362,11 @@ def _assign_view_reshape_index(self, node, node_idx): "dim_to": dim_to} self.idx_view_list.append(view_dict) + def _remove_duplicate_compute(self): + for i in self.idx_trace_list: + for k, v in i['compute'].items(): + i['compute'][k] = list(set(v)) + def _merge_equal_idx(self): idx_equal = copy.deepcopy(self.idx_trace_equal) idx_equal.reverse() @@ -406,6 +416,8 @@ def trace_node_idx(self): continue else: raise NotImplementedError(node.op, "op not implemented yet!") + + self._remove_duplicate_compute() self._merge_equal_idx() @@ -521,6 +533,19 @@ def _print_mem_log(self, log, nodes, title=None): print("") print("\n") + def _print_compute_op_mem_log(self, log, nodes, title=None): + if title: + print(title) + for idx, (l, n) in enumerate(zip(log, nodes)): + if n.op in ['placeholder', 'get_attr', 'output']: + continue + if any(i in n.name for i in ['getitem', 'getattr']): + continue + print("%s:%.2f \t" % (n.name, l), end='') + if (idx + 1) % 3 == 0: + print("") + print("\n") + def estimate_chunk_inference_mem(self, gm: torch.fx.GraphModule, start_nodes=None, end_nodes=None, chunk_dims=None, chunk_sizes=None): act_memory = 0.0 act_memory_peak_log = [] @@ -584,8 +609,10 @@ def estimate_chunk_inference_mem(self, gm: torch.fx.GraphModule, start_nodes=Non active_node_list_log.append(copy.deepcopy(active_node_list)) print("with chunk" if use_chunk else 
"without chunk") - self._print_mem_log(act_memory_peak_log, node_list, "peak") - self._print_mem_log(act_memory_after_node_log, node_list, "after") + # self._print_mem_log(act_memory_peak_log, node_list, "peak") + # self._print_mem_log(act_memory_after_node_log, node_list, "after") + self._print_compute_op_mem_log(act_memory_peak_log, node_list, "peak") + self._print_compute_op_mem_log(act_memory_after_node_log, node_list, "after") # param_memory = parameter_size(gm) # all_memory = act_memory + param_memory @@ -602,7 +629,7 @@ def __init__(self, gm) -> None: def _find_peak_node(self, mem_peak): max_value = max(mem_peak) - max_idx = [mem_peak.index(max_value)] + max_idx = mem_peak.index(max_value) return max_idx def _get_free_var(self): @@ -635,18 +662,35 @@ def _search_max_chunk_region(self, active_node, peak_node): raise RuntimeError() # from peak_node to len-2 chunk_region_end = None - for i in range(peak_node, len(active_node) - 1): + for i in range(peak_node, len(active_node)): if len(active_node[i]) == min_var: - chunk_region_end = i - 1 + chunk_region_end = i break if i in free_vars or i == 0: raise RuntimeError() return chunk_region_start, chunk_region_end + def _not_compute(self, trace, chunk_range, dim_idx): + if trace['idx'][dim_idx] not in trace['compute']: + return True + if trace['idx'][dim_idx] in trace['compute'] and \ + all(i < chunk_range[0] or i > chunk_range[1] for i in trace['compute'][trace['idx'][dim_idx]]): + return True + return False + def _search_possible_chunk_regions(self, max_chunk_region, peak_node): possible_chunk_region = [] + output_trace = copy.deepcopy(self.index_tracer.idx_trace_list) + input_trace = [] + for i, n in enumerate(self.node_list): + if len(n.args) > 0 and n.op != 'output': + input_idx = _find_idx_by_name(n.args[0].name, self.node_list) + input_trace.append(output_trace[input_idx]) + else: + input_trace.append(None) + for before_idx in range(max_chunk_region[0], peak_node): - for after_idx in range(peak_node, 
max_chunk_region[1]): + for after_idx in range(peak_node, max_chunk_region[1] + 1): # skip non compute nodes if any(op in ['placeholder', 'get_attr', 'output'] for op in [self.node_list[before_idx].op, self.node_list[after_idx].op]): @@ -656,23 +700,59 @@ def _search_possible_chunk_regions(self, max_chunk_region, peak_node): continue # select free dim - before_trace = self.index_tracer.idx_trace_list[before_idx] - after_trace = self.index_tracer.idx_trace_list[after_idx] + before_trace = input_trace[before_idx] + after_trace = output_trace[after_idx] free_dim = [] for i in range(min(len(before_trace['idx']), len(after_trace['idx']))): if (before_trace['idx'][i] == after_trace['idx'][i] and - before_trace['idx'][i] not in before_trace['compute'] and - after_trace['idx'][i] not in after_trace['compute']): + self._not_compute(before_trace, (before_idx, after_idx), i) and + self._not_compute(after_trace, (before_idx, after_idx), i) and + self.node_list[after_idx].meta['tensor_meta'].shape[i] != 1): free_dim.append(i) possible_chunk_region.append({'region': (before_idx, after_idx), 'dim': free_dim}) return possible_chunk_region + def _search_best_chunk_region(self, possible_chunk_regions): + max_region_range = 0 + best_regions = None + for i in possible_chunk_regions: + if i['region'][1] - i['region'][0] > max_region_range: + best_regions = i + max_region_range = i['region'][1] - i['region'][0] + return best_regions + + def _step_search(self, peak_node, active_node): + max_chunk_region = self._search_max_chunk_region(active_node, peak_node) + possible_chunk_regions = self._search_possible_chunk_regions(max_chunk_region, peak_node) + best_chunk_region = self._search_best_chunk_region(possible_chunk_regions) + return best_chunk_region + + def _stop_search(self, init_mem_peak, mem_peak): + sorted_init_mem_peak = sorted(init_mem_peak) + if max(mem_peak) < sorted_init_mem_peak[int(len(sorted_init_mem_peak) * 0.5)]: + return True + return False + def search_region(self): - 
mem_peak, mem_after, active_node = self.memory_estimator.estimate_chunk_inference_mem(self.gm) - peak_nodes = self._find_peak_node(mem_peak) - for idx, peak_node in enumerate(peak_nodes): - max_chunk_region = self._search_max_chunk_region(active_node, peak_node) - possible_chunk_regions = self._search_possible_chunk_regions(max_chunk_region, peak_node) + chunk_regions = [] + init_mem_peak, _, active_node = self.memory_estimator.estimate_chunk_inference_mem(self.gm) + mem_peak = init_mem_peak + + while True: + peak_node = self._find_peak_node(mem_peak) + chunk_region = self._step_search(peak_node, active_node) + if chunk_region is None or len(chunk_region['dim']) == 0: + break + + chunk_regions.append(chunk_region) + mem_peak, _, active_node = self.memory_estimator.estimate_chunk_inference_mem( + self.gm, [i['region'][0] for i in chunk_regions], + [i['region'][1] for i in chunk_regions], [i['dim'][0] for i in chunk_regions], [1] * len(chunk_regions)) + + if self._stop_search(init_mem_peak, mem_peak): + break + + return chunk_regions def _gen_chunk_slice_dim(chunk_dim, chunk_idx_name, shape): @@ -696,11 +776,12 @@ def _get_first_non_single_dim(shape): raise RuntimeError("can not get first non single dim for shape", shape) -def _gen_loop_start(chunk_input_meta, chunk_output, chunk_size=2): +def _gen_loop_start(chunk_input_meta, chunk_output, chunk_dim, chunk_size=2): if len(chunk_input_meta) == 1: node = chunk_input_meta[0] node_shape = node.meta['tensor_meta'].shape - chunk_dim = _get_first_non_single_dim(node_shape) + free_shape = [node_shape[i] if i in chunk_dim else 1 for i in range(len(node_shape))] + chunk_dim = _get_first_non_single_dim(free_shape) chunk_slice = _gen_chunk_slice_dim(chunk_dim, "gen_chunk_idx", node_shape) out_shape = str(list(chunk_output.meta['tensor_meta'].shape)) @@ -713,12 +794,13 @@ def _gen_loop_start(chunk_input_meta, chunk_output, chunk_size=2): return context -def _gen_loop_end(chunk_outputs, chunk_inputs, node_list): +def 
_gen_loop_end(chunk_outputs, chunk_inputs, node_list, chunk_dim): chunk_inputs_name = chunk_inputs[0].name chunk_outputs_name = chunk_outputs.name chunk_outputs_idx = _find_idx_by_name(chunk_outputs_name, node_list) chunk_output_shape = chunk_outputs.meta['tensor_meta'].shape - chunk_dim = _get_first_non_single_dim(chunk_output_shape) + free_shape = [chunk_output_shape[i] if i in chunk_dim else 1 for i in range(len(chunk_output_shape))] + chunk_dim = _get_first_non_single_dim(free_shape) chunk_slice = _gen_chunk_slice_dim(chunk_dim, "gen_chunk_idx", chunk_output_shape) context = " chunk_result%s = %s\n" % (chunk_slice, chunk_outputs_name) @@ -780,7 +862,11 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v """ # find the offload regions - chunk_regions = [(58, 62)] + chunk_region_search = ChunkRegionSearch(meta_graph) + chunk_search = chunk_region_search.search_region() + chunk_regions = [i['region'] for i in chunk_search] + chunk_dims = [i['dim'] for i in chunk_search] + chunk_starts = [item[0] for item in chunk_regions] chunk_ends = [item[1] for item in chunk_regions] chunk_inputs = [] @@ -789,16 +875,6 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v node_list = list(nodes) - memory_estimator = MemoryEstimator() - memory_estimator.estimate_chunk_inference_mem(meta_graph, chunk_starts, chunk_ends, [1], [2]) - memory_estimator.estimate_chunk_inference_mem(meta_graph) - - node_index_tracer = NodeIndexTracer(meta_graph) - node_index_tracer.trace_node_idx() - - chunk_region_search = ChunkRegionSearch(meta_graph) - chunk_region_search.search_region() - # find the input and output var names for each offload region for idx, (start, end) in enumerate(chunk_regions): offload_node_list = node_list[start:end + 1] @@ -824,13 +900,13 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v # add for loop chunk_input_meta = [meta_nodes[i] for i in chunk_inputs_idx[region_idx]] - 
body.append(_gen_loop_start(chunk_input_meta, node_list[chunk_ends[region_idx]])) + body.append(_gen_loop_start(chunk_input_meta, node_list[chunk_ends[region_idx]], chunk_dims[region_idx])) if within_chunk_region: emit_node_func(node, body) # replace input var with chunk var if node_idx in chunk_starts: - body[-1] = body[-1].replace("("+ chunk_inputs[region_idx][0].name +")", '(chunk_tensor)') + body[-1] = body[-1].replace(chunk_inputs[region_idx][0].name, 'chunk_tensor') body[-1] = ' ' + body[-1] delete_unused_value_func(node, body, chunk_inputs_names) @@ -840,7 +916,7 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v delete_unused_value_func(node, body, chunk_inputs_names) if node_idx in chunk_ends: - body.append(_gen_loop_end(node, chunk_inputs[region_idx], node_list)) + body.append(_gen_loop_end(node, chunk_inputs[region_idx], node_list, chunk_dims[region_idx])) within_chunk_region = False region_idx += 1 diff --git a/chunk_codegen_run.py b/chunk_codegen_run.py index 39363a80abcb..88c734903392 100644 --- a/chunk_codegen_run.py +++ b/chunk_codegen_run.py @@ -45,8 +45,8 @@ def _test_fwd_and_bwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair): with torch.no_grad(): non_fx_out = model(node, pair) fx_out = gm(node, pair) - assert torch.equal(non_fx_out[0], fx_out[0]), "fx_out doesn't comply with original output" - assert torch.equal(non_fx_out[1], fx_out[1]), "fx_out doesn't comply with original output" + assert torch.allclose(non_fx_out[0], fx_out[0], atol=1e-6), "fx_out doesn't comply with original output" + assert torch.allclose(non_fx_out[1], fx_out[1], atol=1e-6), "fx_out doesn't comply with original output" # test barckward # loss0 = non_fx_out[0].sum() + non_fx_out[1].sum() From f24c418bb04a1e65eaa0f6cf8aada466deca2598 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Tue, 6 Dec 2022 16:29:07 +0800 Subject: [PATCH 023/209] finish chunk define --- chunk_codegen.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 
deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 47cda0f8ed20..6740cd44ab6a 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -827,7 +827,7 @@ def _find_input_and_output_nodes(nodes: List[Node]): for node in nodes: for input_node in node._input_nodes.keys(): node_repr = repr(input_node) - if input_node not in nodes and node_repr not in input_nodes: + if input_node not in nodes and input_node not in input_nodes: input_nodes.append(input_node) # if a node has a user node which is not in the node list @@ -835,7 +835,7 @@ def _find_input_and_output_nodes(nodes: List[Node]): for node in nodes: for output_node in node.users.keys(): node_repr = repr(node) - if output_node not in nodes and node_repr not in output_nodes: + if output_node not in nodes and output_node not in output_nodes: output_nodes.append(output_node) return input_nodes, output_nodes @@ -848,6 +848,16 @@ def _find_idx_by_name(name, nodes_list): raise RuntimeError("name %s not found in node list" % name) +def _replace_name(context, name_from, name_to): + patterns = [(" ", " "), (" ", "."), (" ", ","), ("(", ")"), ("(", ",")] + for p in patterns: + source = p[0] + name_from + p[1] + target = p[0] + name_to + p[1] + if source in context: + context = context.replace(source, target) + return context + + def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_value_func, meta_nodes, meta_graph): """Emit code with nested activation checkpoint When we detect some of the node.activation_checkpoint is a List, we will use @@ -905,8 +915,7 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v if within_chunk_region: emit_node_func(node, body) # replace input var with chunk var - if node_idx in chunk_starts: - body[-1] = body[-1].replace(chunk_inputs[region_idx][0].name, 'chunk_tensor') + body[-1] = _replace_name(body[-1], chunk_inputs[region_idx][0].name, 'chunk_tensor') body[-1] = ' ' + body[-1] delete_unused_value_func(node, body, 
chunk_inputs_names) From a9d64377bb237f34fdafaeec2abcfdfb6e080091 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Tue, 6 Dec 2022 17:34:24 +0800 Subject: [PATCH 024/209] support new op --- chunk_codegen.py | 63 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 2 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 6740cd44ab6a..2dc44d381d85 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -200,8 +200,12 @@ def _assign_linear_index(self, node, node_idx): Args: node (node) node_idx (int) - """ - input_node, weight, bias = node.args + """ + if len(node.args) == 2: + input_node, weight = node.args + bias = None + else: + input_node, weight, bias = node.args input_node_idx_trace = self._find_idx_trace_from_node(input_node) weight_idx_trace = self._find_idx_trace_from_node(weight) @@ -284,6 +288,53 @@ def _assign_softmax_index(self, node, idx): self._assign_index_as_input(node, idx) self._inherit_computation(node.args[0], node) self._mark_computation(node, idx, [node.kwargs['dim']]) + + def _assign_unsqueeze_index(self, node, node_idx): + """ + Assign index for unsqueeze op. + 1. assign new index for unsqueeze dim + + Args: + node (node) + node_idx (int) + """ + self._assign_index_as_input(node, node_idx) + self._inherit_computation(node.args[0], node) + self.idx_trace_list[node_idx]['idx'].insert(node.args[1], self._add_index()) + + def _assign_dropout_index(self, node, node_idx): + """ + Assign index for unsqueeze op. + 1. assign new index for unsqueeze dim + + Args: + node (node) + node_idx (int) + """ + self._assign_index_as_input(node, node_idx) + + + def _assign_ones_like_index(self, node, node_idx): + """ + Assign index for oneslike op. + 1. assign new index for all dim + + Args: + node (node) + node_idx (int) + """ + self._assign_all_index(node, node_idx) + + def _assign_to_index(self, node, node_idx): + """ + Assign index for to op. + 1. 
assign new index for all dim + + Args: + node (node) + node_idx (int) + """ + self._assign_index_as_input(node, node_idx) def _assign_view_reshape_index(self, node, node_idx): """ @@ -388,6 +439,10 @@ def trace_node_idx(self): self._assign_permute_index(node, idx) elif 'view' in node.name or 'reshape' in node.name: self._assign_view_reshape_index(node, idx) + elif 'unsqueeze' in node.name: + self._assign_unsqueeze_index(node, idx) + elif 'to' in node.name: + self._assign_to_index(node, idx) else: raise NotImplementedError(node.name, "method not implemented yet!") elif node.op == 'call_function': @@ -399,6 +454,10 @@ def trace_node_idx(self): self._assign_softmax_index(node, idx) elif any(n in node.name for n in ['mul', 'add', 'sigmoid', 'relu']): self._assign_elementwise_index(node, idx) + elif 'ones_like' in node.name: + self._assign_ones_like_index(node, idx) + elif 'dropout' in node.name: + self._assign_dropout_index(node, idx) elif 'getattr' in node.name: continue # get attr like shape elif 'getitem' in node.name: From 6d99994a7afbfe290bcd798804b4e1e7e76d1281 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Tue, 6 Dec 2022 17:35:27 +0800 Subject: [PATCH 025/209] rename index tracer --- chunk_codegen.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 2dc44d381d85..0f97f94a9d21 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -17,7 +17,7 @@ def _delete_free_var_from_last_use(user_to_last_uses): user_to_last_uses[key].remove(n) -class NodeIndexTracer(object): +class IndexTracer(object): def __init__(self, gm) -> None: self.gm = gm self.nodes_list = list(gm.graph.nodes) @@ -683,7 +683,7 @@ def __init__(self, gm) -> None: self.gm = gm self.node_list = list(gm.graph.nodes) self.memory_estimator = MemoryEstimator() - self.index_tracer = NodeIndexTracer(gm) + self.index_tracer = IndexTracer(gm) self.index_tracer.trace_node_idx() def _find_peak_node(self, mem_peak): From 
2b4ebcc27839b34c015c4fb79e69abd721b83ee6 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Thu, 8 Dec 2022 15:16:10 +0800 Subject: [PATCH 026/209] finishi codegen on msa --- chunk_codegen.py | 212 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 188 insertions(+), 24 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 0f97f94a9d21..1e8305ba395b 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -17,6 +17,121 @@ def _delete_free_var_from_last_use(user_to_last_uses): user_to_last_uses[key].remove(n) +class FlowTracer(object): + def __init__(self, gm) -> None: + self.gm = gm + self.nodes_list = list(gm.graph.nodes) + self.flow_trace = {} + + def _add_trace(self, name): + self.flow_trace[name] = [] + + def _add_node(self, trace_name, node): + self.flow_trace[trace_name].append({'node': node, 'inside_depend': [], 'outside_depend': []}) + + def _add_inside_depend(self, flow_name, node, inside_depend_node): + for i in self.flow_trace[flow_name]: + if i['node'] == node: + i['inside_depend'].append(inside_depend_node) + return + raise RuntimeError("node not found") + + def _add_outside_depend(self, flow_name, node, outside_depend_node, outside_depend_trace): + for i in self.flow_trace[flow_name]: + if i['node'] == node: + i['outside_depend'].append({outside_depend_trace: outside_depend_node}) + return + raise RuntimeError("node not found") + + def _init_trace(self): + for i in self.nodes_list: + if i.op == 'placeholder': + self._add_trace(i.name) + self._add_node(i.name, i) + + def _is_non_compute_node(self, node): + if any(i in node.op for i in ['placeholder', 'get_attr', 'output']) or \ + any(i in node.name for i in ['getitem', 'getattr']): + return True + return False + + def _is_non_compute_node_except_placeholder(self, node): + if any(i in node.op for i in ['get_attr', 'output']) or \ + any(i in node.name for i in ['getitem', 'getattr']): + return True + return False + + def _find_flow_for_node(self, node): + if type(self.nodes_list[0]) != 
type(node): + return None + if self._is_non_compute_node_except_placeholder(node): + return None + for name, trace in self.flow_trace.items(): + for i in trace: + if node == i['node']: + return name + if any(i in node.name for i in ["ones_like"]): + self._add_trace(node.name) + self._add_node(node.name, node) + return node.name + raise RuntimeError("node not found") + + def _find_first_valid_flow(self, flow): + for i in flow: + if i is not None: + return i + raise RuntimeError("invalid flow") + + def find_node_flow(self, node): + for name, trace in self.flow_trace.items(): + for i in trace: + if node == i['node']: + return name, i + raise RuntimeError("invalid node") + + def get_flow_mix(self, node): + if self._is_non_compute_node(node): + return None + _, node_trace = self.find_node_flow(node) + if len(node_trace['outside_depend']) == 0: + return None + elif len(node_trace['outside_depend']) > 1: + raise NotImplementedError + vars = list(node_trace['outside_depend'][0].values())[0] + return vars + + def get_same_flow_node(self, node_list, node): + name, _ = self.find_node_flow(node) + result = [] + for i in self.flow_trace[name]: + if i['node'] in node_list: + result.append(i['node']) + return result + + def trace_flow(self): + # init trace + self._init_trace() + + for node in self.nodes_list: + # skip if non compute node + if all(type(arg) != type(node) or self._is_non_compute_node_except_placeholder(arg) for arg in node.args) \ + or self._is_non_compute_node(node): + continue + + node_input_flows = [self._find_flow_for_node(arg) for arg in node.args] + + node_domin_flow = self._find_first_valid_flow(node_input_flows) + self._add_node(node_domin_flow, node) + for node_input_flow, arg in zip(node_input_flows, node.args): + if node_input_flow is None: + continue + elif node_input_flow == node_domin_flow: + self._add_inside_depend(node_domin_flow, node, arg) + else: + self._add_outside_depend(node_domin_flow, node, arg, node_input_flow) + return self.flow_trace + + 
class IndexTracer(object): def __init__(self, gm) -> None: self.gm = gm @@ -428,7 +543,7 @@ def _merge_equal_idx(self): if merge_from in trace['idx']: trace['idx'] = [merge_to if i == merge_from else i for i in trace['idx']] - def trace_node_idx(self): + def trace_index(self): for idx, node in enumerate(self.nodes_list): if node.op == 'placeholder': self._assign_all_index(node, idx) @@ -684,7 +799,9 @@ def __init__(self, gm) -> None: self.node_list = list(gm.graph.nodes) self.memory_estimator = MemoryEstimator() self.index_tracer = IndexTracer(gm) - self.index_tracer.trace_node_idx() + self.index_tracer.trace_index() + self.flow_tracer = FlowTracer(gm) + self.flow_tracer.trace_flow() def _find_peak_node(self, mem_peak): max_value = max(mem_peak) @@ -729,7 +846,7 @@ def _search_max_chunk_region(self, active_node, peak_node): raise RuntimeError() return chunk_region_start, chunk_region_end - def _not_compute(self, trace, chunk_range, dim_idx): + def _is_not_compute(self, trace, chunk_range, dim_idx): if trace['idx'][dim_idx] not in trace['compute']: return True if trace['idx'][dim_idx] in trace['compute'] and \ @@ -737,6 +854,56 @@ def _not_compute(self, trace, chunk_range, dim_idx): return True return False + def _detect_flow(self, before_trace, after_trace, start_idx, end_idx, dim_idx): + inputs, outputs = _find_input_and_output_nodes(self.node_list[start_idx:end_idx + 1]) + chunk_info = {'inputs': inputs, 'outputs': outputs} + flow_flag = False + + for idx in range(start_idx, end_idx + 1): + node = self.node_list[idx] + mix_flow_var = self.flow_tracer.get_flow_mix(node) + if mix_flow_var is None: + continue + + # if there is a flow mix, op must be in [mul, add, div, matmul] + # element-wise op requires dim to be equal in every dim + if any(n in node.name for n in ['mul', 'add']): + for i in node.args: + if type(i) == type(mix_flow_var) and i != mix_flow_var: + main_flow_var = i + # if mix flow is a broadcast in chunk dim, + # TODO need to move that flow out of the 
chunk + if mix_flow_var.meta['tensor_meta'].shape[dim_idx] == 1: + flow_flag = True + for i in self.flow_tracer.get_same_flow_node(chunk_info['inputs'], mix_flow_var): + chunk_info['inputs'].remove(i) + # else, we need to chunk mix var as well + else: + # TODO chunk another value + flow_flag = False + break + else: + raise NotImplementedError("%s not implemented" % node.name) + return flow_flag, chunk_info + + def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx): + before_trace = input_trace[start_idx] + after_trace = output_trace[end_idx] + free_dim = [] + chunk_infos = [] + for i in range(min(len(before_trace['idx']), len(after_trace['idx']))): + if not (before_trace['idx'][i] == after_trace['idx'][i] and + self._is_not_compute(before_trace, (start_idx, end_idx), i) and + self._is_not_compute(after_trace, (start_idx, end_idx), i) and + self.node_list[end_idx].meta['tensor_meta'].shape[i] != 1): + continue + flow_flag, chunk_info = self._detect_flow(before_trace, after_trace, start_idx, end_idx, i) + if flow_flag == None: + continue + chunk_infos.append(chunk_info) + free_dim.append(i) + return free_dim, chunk_infos + def _search_possible_chunk_regions(self, max_chunk_region, peak_node): possible_chunk_region = [] output_trace = copy.deepcopy(self.index_tracer.idx_trace_list) @@ -748,27 +915,22 @@ def _search_possible_chunk_regions(self, max_chunk_region, peak_node): else: input_trace.append(None) - for before_idx in range(max_chunk_region[0], peak_node): - for after_idx in range(peak_node, max_chunk_region[1] + 1): + for start_idx in range(max_chunk_region[0], peak_node): + for end_idx in range(peak_node, max_chunk_region[1] + 1): # skip non compute nodes if any(op in ['placeholder', 'get_attr', 'output'] for op in - [self.node_list[before_idx].op, self.node_list[after_idx].op]): + [self.node_list[start_idx].op, self.node_list[end_idx].op]): continue if any(any(i in name for i in ['getitem', 'getattr']) for name in - 
[self.node_list[before_idx].name, self.node_list[after_idx].name]): + [self.node_list[start_idx].name, self.node_list[end_idx].name]): continue # select free dim - before_trace = input_trace[before_idx] - after_trace = output_trace[after_idx] - free_dim = [] - for i in range(min(len(before_trace['idx']), len(after_trace['idx']))): - if (before_trace['idx'][i] == after_trace['idx'][i] and - self._not_compute(before_trace, (before_idx, after_idx), i) and - self._not_compute(after_trace, (before_idx, after_idx), i) and - self.node_list[after_idx].meta['tensor_meta'].shape[i] != 1): - free_dim.append(i) - possible_chunk_region.append({'region': (before_idx, after_idx), 'dim': free_dim}) + free_dim, chunk_info = self._find_free_dim(input_trace, output_trace, start_idx, end_idx) + if len(free_dim) > 0: + free_dim = [free_dim[0]] + chunk_info = [chunk_info[0]] + possible_chunk_region.append({'region': (start_idx, end_idx), 'dim': free_dim, 'chunk_info': chunk_info}) return possible_chunk_region def _search_best_chunk_region(self, possible_chunk_regions): @@ -935,21 +1097,23 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v chunk_search = chunk_region_search.search_region() chunk_regions = [i['region'] for i in chunk_search] chunk_dims = [i['dim'] for i in chunk_search] + chunk_infos = [i['chunk_info'] for i in chunk_search] chunk_starts = [item[0] for item in chunk_regions] chunk_ends = [item[1] for item in chunk_regions] - chunk_inputs = [] - chunk_outputs = [] + chunk_inputs = [[j['inputs'][0] for j in i] for i in chunk_infos] + chunk_outputs = [[j['outputs'][0] for j in i] for i in chunk_infos] within_chunk_region = False node_list = list(nodes) # find the input and output var names for each offload region - for idx, (start, end) in enumerate(chunk_regions): - offload_node_list = node_list[start:end + 1] - inputs, outputs = _find_input_and_output_nodes(offload_node_list) - chunk_inputs.append(inputs) - chunk_outputs.append(outputs) + # 
for idx, (start, end) in enumerate(chunk_regions): + # offload_node_list = node_list[start:end + 1] + # inputs, outputs = _find_input_and_output_nodes(offload_node_list) + # chunk_inputs.append(inputs) + # chunk_outputs.append(outputs) + chunk_inputs_idx = [[_find_idx_by_name(j.name, node_list) for j in i] for i in chunk_inputs] chunk_outputs_idx = [[_find_idx_by_name(j.name, node_list) for j in i] for i in chunk_outputs] chunk_inputs_names = [] From 979e61db92a95b8bc2904c5b38264f24060be310 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Fri, 9 Dec 2022 17:39:02 +0800 Subject: [PATCH 027/209] redesign index tracer, add source and change compute --- chunk_codegen.py | 310 +++++++++++++++++++++++++++++++---------------- 1 file changed, 206 insertions(+), 104 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 1e8305ba395b..ce7d849178d1 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -16,6 +16,11 @@ def _delete_free_var_from_last_use(user_to_last_uses): if n.op == 'placeholder': user_to_last_uses[key].remove(n) +def _get_node_shape(node): + if hasattr(node.meta['tensor_meta'], "shape"): + return node.meta['tensor_meta'].shape + return None + class FlowTracer(object): def __init__(self, gm) -> None: @@ -136,11 +141,25 @@ class IndexTracer(object): def __init__(self, gm) -> None: self.gm = gm self.nodes_list = list(gm.graph.nodes) - self.idx_trace_list = [{'idx': [], 'compute': {}} for _ in range(len(self.nodes_list))] + self.idx_trace_list = self._init_idx_trace_list() self.idx_trace_equal = [] self.idx_view_list = [] self.idx_count = -1 + def _init_idx_trace_list(self): + idx_trace_list = [] + for n in self.nodes_list: + if _get_node_shape(n) != None: + cur_trace = { + 'idx': [None for _ in range(len(_get_node_shape(n)))], + 'compute': [[] for _ in range(len(_get_node_shape(n)))], + 'source': [[] for _ in range(len(_get_node_shape(n)))], + } + else: + cur_trace = {'idx': [], 'compute': [], 'source': []} + idx_trace_list.append(cur_trace) + return 
idx_trace_list + def _add_index(self): """ Update the count and return it. To record the idx number. @@ -150,35 +169,81 @@ def _add_index(self): """ self.idx_count += 1 return self.idx_count - - def _inherit_computation(self, node_from, node_to): - """ - Inherit computed dim from node_from to node_to. - If a dim in node_from is marked as computed and exists in node_to, - still mark it as computed in node_to. - - Args: - node_from (node): node to be inherited - node_to (node): new node to inherit - """ - _, compute_from = self._find_trace_from_node(node_from) - idx_to, compute_to = self._find_trace_from_node(node_to) - for k, v in compute_from.items(): - if k in idx_to: - if k in compute_to: - compute_to[k].extend(v) - else: - compute_to[k] = copy.deepcopy(v) - def _mark_idx_equal(self, idx1, idx2): + def _del_dim(self, idx, dim_idx): + self.idx_trace_list[idx]['idx'].pop(dim_idx) + self.idx_trace_list[idx]['compute'].pop(dim_idx) + self.idx_trace_list[idx]['source'].pop(dim_idx) + + def _add_dim(self, idx, dim_idx): + self.idx_trace_list[idx]['idx'].insert(dim_idx, self._add_index()) + self.idx_trace_list[idx]['compute'].insert(dim_idx, []) + self.idx_trace_list[idx]['source'].insert(dim_idx, []) + + def _transform_index(self, node, node_dim): + node_idx = self._find_idx_trace_from_node(node) + dims = list(range(len(node_idx))) + return dims[node_dim] + + def _inherit_index(self, node_from, node_from_dim, node_to, node_to_dim): + node_from_dim = self._transform_index(node_from, node_from_dim) + node_to_dim = self._transform_index(node_to, node_to_dim) + node_from_trace = self._find_trace_from_node(node_from) + node_to_trace = self._find_trace_from_node(node_to) + node_to_trace['idx'][node_to_dim] = node_from_trace['idx'][node_from_dim] + node_to_trace['compute'][node_to_dim] = copy.deepcopy(node_from_trace['compute'][node_from_dim]) + node_from_idx = _find_idx_by_name(node_from.name, self.nodes_list) + node_to_trace['source'][node_to_dim] = [] + 
node_to_trace['source'][node_to_dim].append({node_from_idx: node_from_dim}) + node_to_trace['source'][node_to_dim].extend(node_from_trace['source'][node_from_dim]) + + def _inherit_all_computation(self, node_from, node_to): + node_from_compute = self._find_compute_trace_from_node(node_from) + node_to_compute = self._find_compute_trace_from_node(node_to) + assert len(node_from_compute) == len(node_to_compute) + for i in range(len(node_from_compute)): + self._add_source(node_from, i, node_to, i) + node_to_compute[i] = copy.deepcopy(node_from_compute[i]) + + def _add_source(self, node_from, node_from_dim, node_to, node_to_dim): + node_from_dim = self._transform_index(node_from, node_from_dim) + node_from_trace = self._find_trace_from_node(node_from) + node_to_dim = self._transform_index(node_to, node_to_dim) + node_to_trace = self._find_trace_from_node(node_to) + node_from_idx = _find_idx_by_name(node_from.name, self.nodes_list) + node_to_trace['source'][node_to_dim].append({node_from_idx: node_from_dim}) + node_to_trace['source'][node_to_dim].extend(node_from_trace['source'][node_from_dim]) + + def _mark_computation_from_node(self, node_from, node_to, exclude=None): + if exclude == None: + exclude = [] + else: + exclude = [self._transform_index(node_to, i) for i in exclude] + node_from_compute = self._find_compute_trace_from_node(node_from) + node_to_compute = self._find_compute_trace_from_node(node_to) + # assert len(node_from_compute) == len(node_to_compute) + for i in range(-1, -min(len(node_from_compute), len(node_to_compute)) - 1, -1): + if self._transform_index(node_to, i) in exclude: + continue + self._add_source(node_from, i, node_to, i) + for j in node_from_compute[i]: + if j not in node_to_compute[i]: + node_to_compute[i].append(j) + + def _mark_idx_equal(self, node1, dim1, node2, dim2): """ Mark 2 index to be equal. Args: idx1 (int): index count. idx2 (int): index count. 
- """ - self.idx_trace_equal.append((idx1, idx2)) + """ + # node1_idx = _find_idx_by_name(node1.name, self.nodes_list) + # node2_idx = _find_idx_by_name(node2.name, self.nodes_list) + # if node1_idx > node2_idx: + # self._add_source(node2, dim2, node1, dim1) + # else: + # self._add_source(node1, dim1, node2, dim2) def _mark_computation(self, node, idx, dim): """ @@ -189,16 +254,14 @@ def _mark_computation(self, node, idx, dim): idx (int): node index dim (list or int): dims to be marked as computed """ - input_node_idx_trace = self._find_idx_trace_from_node(node) if isinstance(dim, int): dim = [dim] + dims = list(range(len(_get_node_shape(node)))) for d in dim: - cur_idx = input_node_idx_trace[d] - if cur_idx not in self.idx_trace_list[idx]['compute']: - self.idx_trace_list[idx]['compute'][cur_idx] = [idx] - else: - self.idx_trace_list[idx]['compute'][cur_idx].append(idx) - + cur_dim = dims[d] + if idx not in self.idx_trace_list[idx]['compute'][cur_dim]: + self.idx_trace_list[idx]['compute'][cur_dim].append(idx) + def _find_trace_from_node(self, node): """ Find node idx and compute trace by the node. @@ -211,7 +274,7 @@ def _find_trace_from_node(self, node): """ node_idx = _find_idx_by_name(node.name, self.nodes_list) node_dict = self.idx_trace_list[node_idx] - return node_dict['idx'], node_dict['compute'] + return node_dict def _find_idx_trace_from_node(self, node): """ @@ -237,19 +300,23 @@ def _find_compute_trace_from_node(self, node): node_idx = _find_idx_by_name(node.name, self.nodes_list) return self.idx_trace_list[node_idx]['compute'] - def _assign_index_as_input(self, node, node_idx): + def _assign_index_as_input(self, node, node_idx, input_node=None): """ Assign node's trace as its input node. 
Args: node (node) node_idx (int) - """ - input_node_idx = _find_idx_by_name(node.args[0].name, self.nodes_list) + """ + if input_node == None: + input_node = node.args[0] + input_node_idx = _find_idx_by_name(input_node.name, self.nodes_list) input_node_idx_trace = self.idx_trace_list[input_node_idx]['idx'] new_idx_trace = copy.deepcopy(input_node_idx_trace) self.idx_trace_list[node_idx]['idx'] = new_idx_trace + + self._inherit_all_computation(input_node, node) def _assign_all_index(self, node, node_idx): """ @@ -275,15 +342,12 @@ def _assign_transpose_index(self, node, node_idx): node (node) node_idx (int) """ + input_node = node.args[0] tranpose_dim = node.args[1:] - input_node_idx_trace = self._find_idx_trace_from_node(node.args[0]) - new_idx_trace = copy.deepcopy(input_node_idx_trace) - new_idx_trace[tranpose_dim[0]] = input_node_idx_trace[tranpose_dim[1]] - new_idx_trace[tranpose_dim[1]] = input_node_idx_trace[tranpose_dim[0]] - - self.idx_trace_list[node_idx]['idx'] = new_idx_trace - self._inherit_computation(node.args[0], node) + self._assign_index_as_input(node, node_idx, input_node) + self._inherit_index(input_node, tranpose_dim[1], node, tranpose_dim[0]) + self._inherit_index(input_node, tranpose_dim[0], node, tranpose_dim[1]) def _assign_permute_index(self, node, node_idx): """ @@ -296,14 +360,11 @@ def _assign_permute_index(self, node, node_idx): node_idx (int) """ permute_dim = node.args[1:] - input_node_idx_trace = self._find_idx_trace_from_node(node.args[0]) + input_node = node.args[0] - new_idx_trace = copy.deepcopy(input_node_idx_trace) + self._assign_index_as_input(node, node_idx, input_node) for idx, d in enumerate(permute_dim): - new_idx_trace[idx] = input_node_idx_trace[d] - - self.idx_trace_list[node_idx]['idx'] = new_idx_trace - self._inherit_computation(node.args[0], node) + self._inherit_index(input_node, d, node, idx) def _assign_linear_index(self, node, node_idx): """ @@ -321,20 +382,15 @@ def _assign_linear_index(self, node, node_idx): 
bias = None else: input_node, weight, bias = node.args - input_node_idx_trace = self._find_idx_trace_from_node(input_node) - weight_idx_trace = self._find_idx_trace_from_node(weight) - new_idx_trace = copy.deepcopy(input_node_idx_trace) - new_idx_trace[-1] = weight_idx_trace[1] - self.idx_trace_list[node_idx]['idx'] = new_idx_trace + self._assign_index_as_input(node, node_idx) + self._inherit_index(weight, 1, node, -1) - self._inherit_computation(input_node, node) self._mark_computation(node, node_idx, [-1]) - self._mark_idx_equal(input_node_idx_trace[-1], weight_idx_trace[0]) + self._mark_idx_equal(input_node, -1, weight, 0) if bias: - bias_idx_trace = self._find_idx_trace_from_node(bias) - self._mark_idx_equal(input_node_idx_trace[-1], bias_idx_trace[0]) + self._mark_idx_equal(input_node, -1, bias, 0) def _assign_matmul_index(self, node, node_idx): """ @@ -348,18 +404,14 @@ def _assign_matmul_index(self, node, node_idx): node_idx (int) """ matmul_left, matmul_right = node.args - matmul_left_idx_trace = self._find_idx_trace_from_node(matmul_left) - matmul_right_idx_trace = self._find_idx_trace_from_node(matmul_right) - assert(len(matmul_left_idx_trace) == len(matmul_right_idx_trace)) - new_idx_trace = copy.deepcopy(matmul_left_idx_trace) - new_idx_trace[-1] = matmul_right_idx_trace[-1] - self.idx_trace_list[node_idx]['idx'] = new_idx_trace + assert(len(_get_node_shape(matmul_left)) == len(_get_node_shape(matmul_right))) + self._assign_index_as_input(node, node_idx, matmul_left) + self._inherit_index(matmul_right, -1, node, -1) - self._inherit_computation(matmul_left, node) - self._inherit_computation(matmul_right, node) + self._mark_computation_from_node(matmul_right, node, [-1, -2]) self._mark_computation(node, node_idx, [-1]) - self._mark_idx_equal(matmul_left_idx_trace[-1], matmul_right_idx_trace[-2]) + self._mark_idx_equal(matmul_left, -1, matmul_right, -2) def _assign_layernorm_index(self, node, idx): """ @@ -372,7 +424,6 @@ def _assign_layernorm_index(self, 
node, idx): node_idx (int) """ self._assign_index_as_input(node, idx) - self._inherit_computation(node.args[0], node) self._mark_computation(node, idx, [-1, -2]) def _assign_elementwise_index(self, node, idx): @@ -386,9 +437,59 @@ def _assign_elementwise_index(self, node, idx): node_idx (int) """ self._assign_index_as_input(node, idx) + nodes_in = [] for node_in in node.args: - if type(node_in) not in (int, float): - self._inherit_computation(node_in, node) + if type(node_in) == type(node): + nodes_in.append(node_in) + self._mark_computation_from_node(node_in, node) + assert len(nodes_in) <= 2 + if len(nodes_in) == 2: + node_in0_shape = _get_node_shape(nodes_in[0]) + node_in1_shape = _get_node_shape(nodes_in[1]) + for i in range(-1, -min(len(node_in0_shape), len(node_in1_shape)) - 1, -1): + if node_in0_shape[i] == node_in1_shape[i]: + self._mark_idx_equal(nodes_in[0], i, nodes_in[1], i) + + def _assgin_no_change_index(self, node, idx): + self._assign_index_as_input(node, idx) + for node_in in node.args: + if type(node_in) == type(node): + self._mark_computation_from_node(node_in, node) + + def _assign_einsum_index(self, node, idx): + """ + Assign index for einsum op. 
+ + Args: + node (node) + node_idx (int) + """ + patterns = node.args[0] + input_nodes = node.args[1:] + + patterns = patterns.replace(" ", "") + left, right = patterns.split("->") + left = left.split(",") + + all_index = [] + for i in left: + for c in i: + all_index.append(c) + all_index = set(all_index) + free_index = set([i for i in right]) + sum_index = all_index - free_index + + for right_idx, right_indice in enumerate(right): + for left_idx, left_str in enumerate(left): + if right_indice in left_str: + source_idx = left_str.index(right_indice) + self._inherit_index(input_nodes[left_idx], source_idx, node, right_idx) + + for i in sum_index: + for left_idx, left_str in enumerate(left): + if i in left_str: + self._mark_computation(node, idx, left_str.index(i)) + break def _assign_softmax_index(self, node, idx): """ @@ -401,7 +502,6 @@ def _assign_softmax_index(self, node, idx): node_idx (int) """ self._assign_index_as_input(node, idx) - self._inherit_computation(node.args[0], node) self._mark_computation(node, idx, [node.kwargs['dim']]) def _assign_unsqueeze_index(self, node, node_idx): @@ -412,10 +512,12 @@ def _assign_unsqueeze_index(self, node, node_idx): Args: node (node) node_idx (int) - """ + """ + self._del_dim(node_idx, -1) self._assign_index_as_input(node, node_idx) - self._inherit_computation(node.args[0], node) self.idx_trace_list[node_idx]['idx'].insert(node.args[1], self._add_index()) + self.idx_trace_list[node_idx]['compute'].insert(node.args[1], []) + self.idx_trace_list[node_idx]['source'].insert(node.args[1], []) def _assign_dropout_index(self, node, node_idx): """ @@ -427,7 +529,6 @@ def _assign_dropout_index(self, node, node_idx): node_idx (int) """ self._assign_index_as_input(node, node_idx) - def _assign_ones_like_index(self, node, node_idx): """ @@ -439,17 +540,6 @@ def _assign_ones_like_index(self, node, node_idx): node_idx (int) """ self._assign_all_index(node, node_idx) - - def _assign_to_index(self, node, node_idx): - """ - Assign index 
for to op. - 1. assign new index for all dim - - Args: - node (node) - node_idx (int) - """ - self._assign_index_as_input(node, node_idx) def _assign_view_reshape_index(self, node, node_idx): """ @@ -494,26 +584,26 @@ def _assign_view_reshape_index(self, node, node_idx): dim_equal = [i == j for i, j in zip(origin_shape[:-1], target_shape)] dim_to = [dim_equal.index(False)] dim_from = [dim_equal.index(False), dim_equal.index(False) + 1] + self._add_dim(node_idx, -1) elif len_diff == -1: # dim expand dim_equal = [i == j for i, j in zip(origin_shape, target_shape[:-1])] dim_from = [dim_equal.index(False)] dim_to = [dim_equal.index(False), dim_equal.index(False) + 1] + self._del_dim(node_idx, -1) else: raise NotImplementedError("shape" + str(origin_shape) + 'and' + str(target_shape) + "view not implemented") # get new index origin_trace = self._find_idx_trace_from_node(origin_node) - new_trace = copy.deepcopy(origin_trace) + self._assign_index_as_input(node, node_idx, origin_node) dim_from.reverse() for i in dim_from: - new_trace.pop(i) + self._del_dim(node_idx, i) for i in dim_to: - new_trace.insert(i, self._add_index()) - self.idx_trace_list[node_idx]['idx'] = new_trace + self._add_dim(node_idx, i) # inherit computation - self._inherit_computation(origin_node, node) compute_log = self._find_compute_trace_from_node(origin_node) for i in dim_from: if origin_trace[i] in compute_log: @@ -524,15 +614,10 @@ def _assign_view_reshape_index(self, node, node_idx): # log view, not used now view_dict = {"idx_from": [origin_trace[i] for i in dim_from], "dim_from": dim_from, - "idx_to": [new_trace[i] for i in dim_to], + "idx_to": [self.idx_trace_list[node_idx]['idx'][i] for i in dim_to], "dim_to": dim_to} self.idx_view_list.append(view_dict) - - def _remove_duplicate_compute(self): - for i in self.idx_trace_list: - for k, v in i['compute'].items(): - i['compute'][k] = list(set(v)) - + def _merge_equal_idx(self): idx_equal = copy.deepcopy(self.idx_trace_equal) idx_equal.reverse() 
@@ -556,8 +641,8 @@ def trace_index(self): self._assign_view_reshape_index(node, idx) elif 'unsqueeze' in node.name: self._assign_unsqueeze_index(node, idx) - elif 'to' in node.name: - self._assign_to_index(node, idx) + elif any(i in node.name for i in ['to', 'contiguous']): + self._assgin_no_change_index(node, idx) else: raise NotImplementedError(node.name, "method not implemented yet!") elif node.op == 'call_function': @@ -573,6 +658,8 @@ def trace_index(self): self._assign_ones_like_index(node, idx) elif 'dropout' in node.name: self._assign_dropout_index(node, idx) + elif 'einsum' in node.name: + self._assign_einsum_index(node, idx) elif 'getattr' in node.name: continue # get attr like shape elif 'getitem' in node.name: @@ -590,10 +677,20 @@ def trace_index(self): continue else: raise NotImplementedError(node.op, "op not implemented yet!") - - self._remove_duplicate_compute() - self._merge_equal_idx() - + # self._merge_equal_idx() + + def check_index(self, trace_idx, start_idx, end_idx): + for i in range(start_idx, end_idx + 1): + cur_idx = self.idx_trace_list[i]['idx'] + cur_compute = self.idx_trace_list[i]['compute'] + if trace_idx in cur_compute: + for j in cur_compute[trace_idx]: + if j < start_idx or j > end_idx: + return False + # same_idx = [1 if j == trace_idx else 0 for j in cur_idx] + # if sum(same_idx) > 1: + # return False + return True class MemoryEstimator(object): def __init__(self) -> None: @@ -897,6 +994,8 @@ def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx): self._is_not_compute(after_trace, (start_idx, end_idx), i) and self.node_list[end_idx].meta['tensor_meta'].shape[i] != 1): continue + if not self.index_tracer.check_index(before_trace['idx'][i], start_idx, end_idx): + continue flow_flag, chunk_info = self._detect_flow(before_trace, after_trace, start_idx, end_idx, i) if flow_flag == None: continue @@ -910,7 +1009,10 @@ def _search_possible_chunk_regions(self, max_chunk_region, peak_node): input_trace = [] for i, n in 
enumerate(self.node_list): if len(n.args) > 0 and n.op != 'output': - input_idx = _find_idx_by_name(n.args[0].name, self.node_list) + if isinstance(n.args[0], str): + input_idx = _find_idx_by_name(n.args[1].name, self.node_list) + else: + input_idx = _find_idx_by_name(n.args[0].name, self.node_list) input_trace.append(output_trace[input_idx]) else: input_trace.append(None) @@ -930,7 +1032,7 @@ def _search_possible_chunk_regions(self, max_chunk_region, peak_node): if len(free_dim) > 0: free_dim = [free_dim[0]] chunk_info = [chunk_info[0]] - possible_chunk_region.append({'region': (start_idx, end_idx), 'dim': free_dim, 'chunk_info': chunk_info}) + possible_chunk_region.append({'region': (start_idx, end_idx), 'dim': free_dim, 'chunk_info': chunk_info}) return possible_chunk_region def _search_best_chunk_region(self, possible_chunk_regions): @@ -1130,6 +1232,7 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v if node_idx in chunk_starts: within_chunk_region = True + region_idx = chunk_starts.index(node_idx) # add for loop chunk_input_meta = [meta_nodes[i] for i in chunk_inputs_idx[region_idx]] @@ -1150,7 +1253,6 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v if node_idx in chunk_ends: body.append(_gen_loop_end(node, chunk_inputs[region_idx], node_list, chunk_dims[region_idx])) within_chunk_region = False - region_idx += 1 node_idx += 1 From 929445116a14d30ebbd50c5978a8f4db52ab3cd6 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Sat, 10 Dec 2022 17:29:51 +0800 Subject: [PATCH 028/209] pass outproduct mean --- chunk_codegen.py | 317 +++++++++++++++++++++++++++++++---------------- 1 file changed, 212 insertions(+), 105 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index ce7d849178d1..fc3c88cf91f6 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -16,16 +16,31 @@ def _delete_free_var_from_last_use(user_to_last_uses): if n.op == 'placeholder': user_to_last_uses[key].remove(n) + def 
_get_node_shape(node): if hasattr(node.meta['tensor_meta'], "shape"): return node.meta['tensor_meta'].shape return None +def _is_non_compute_node(node): + if any(i in node.op for i in ['placeholder', 'get_attr', 'output']) or \ + any(i in node.name for i in ['getitem', 'getattr']): + return True + return False + + +def _is_non_compute_node_except_placeholder(node): + if any(i in node.op for i in ['get_attr', 'output']) or \ + any(i in node.name for i in ['getitem', 'getattr']): + return True + return False + + class FlowTracer(object): def __init__(self, gm) -> None: self.gm = gm - self.nodes_list = list(gm.graph.nodes) + self.node_list = list(gm.graph.nodes) self.flow_trace = {} def _add_trace(self, name): @@ -49,7 +64,7 @@ def _add_outside_depend(self, flow_name, node, outside_depend_node, outside_depe raise RuntimeError("node not found") def _init_trace(self): - for i in self.nodes_list: + for i in self.node_list: if i.op == 'placeholder': self._add_trace(i.name) self._add_node(i.name, i) @@ -67,7 +82,7 @@ def _is_non_compute_node_except_placeholder(self, node): return False def _find_flow_for_node(self, node): - if type(self.nodes_list[0]) != type(node): + if type(self.node_list[0]) != type(node): return None if self._is_non_compute_node_except_placeholder(node): return None @@ -117,7 +132,7 @@ def trace_flow(self): # init trace self._init_trace() - for node in self.nodes_list: + for node in self.node_list: # skip if non compute node if all(type(arg) != type(node) or self._is_non_compute_node_except_placeholder(arg) for arg in node.args) \ or self._is_non_compute_node(node): @@ -135,6 +150,41 @@ def trace_flow(self): else: self._add_outside_depend(node_domin_flow, node, arg, node_input_flow) return self.flow_trace + + def _detect_flow(self, start_idx, start_dim, end_idx, end_dim): + inputs, outputs = _find_chunk_input_and_output_nodes(self.node_list[start_idx:end_idx + 1]) + chunk_info = {'region': (start_idx, end_idx), + 'inputs': inputs, 'inputs_dim': 
start_dim, + 'outputs': outputs, 'outputs_dim': end_dim, + 'args': {}} + flow_flag = False + + for idx in range(start_idx, end_idx + 1): + node = self.node_list[idx] + mix_flow_var = self.get_flow_mix(node) + if mix_flow_var is None: + continue + + # if there is a flow mix, op must be in [mul, add, div, matmul] + # element-wise op requires dim to be equal in every dim + if any(n in node.name for n in ['mul', 'add']): + for i in node.args: + if type(i) == type(mix_flow_var) and i != mix_flow_var: + main_flow_var = i + # if mix flow is a broadcast in chunk dim, + # TODO need to move that flow out of the chunk + if mix_flow_var.meta['tensor_meta'].shape[dim_idx] == 1: + flow_flag = True + for i in self.get_same_flow_node(chunk_info['inputs'], mix_flow_var): + chunk_info['inputs'].remove(i) + # else, we need to chunk mix var as well + else: + # TODO chunk another value + flow_flag = False + break + else: + raise NotImplementedError("%s not implemented" % node.name) + return flow_flag, chunk_info class IndexTracer(object): @@ -153,7 +203,7 @@ def _init_idx_trace_list(self): cur_trace = { 'idx': [None for _ in range(len(_get_node_shape(n)))], 'compute': [[] for _ in range(len(_get_node_shape(n)))], - 'source': [[] for _ in range(len(_get_node_shape(n)))], + 'source': [{} for _ in range(len(_get_node_shape(n)))], } else: cur_trace = {'idx': [], 'compute': [], 'source': []} @@ -178,7 +228,7 @@ def _del_dim(self, idx, dim_idx): def _add_dim(self, idx, dim_idx): self.idx_trace_list[idx]['idx'].insert(dim_idx, self._add_index()) self.idx_trace_list[idx]['compute'].insert(dim_idx, []) - self.idx_trace_list[idx]['source'].insert(dim_idx, []) + self.idx_trace_list[idx]['source'].insert(dim_idx, {}) def _transform_index(self, node, node_dim): node_idx = self._find_idx_trace_from_node(node) @@ -192,10 +242,7 @@ def _inherit_index(self, node_from, node_from_dim, node_to, node_to_dim): node_to_trace = self._find_trace_from_node(node_to) node_to_trace['idx'][node_to_dim] = 
node_from_trace['idx'][node_from_dim] node_to_trace['compute'][node_to_dim] = copy.deepcopy(node_from_trace['compute'][node_from_dim]) - node_from_idx = _find_idx_by_name(node_from.name, self.nodes_list) - node_to_trace['source'][node_to_dim] = [] - node_to_trace['source'][node_to_dim].append({node_from_idx: node_from_dim}) - node_to_trace['source'][node_to_dim].extend(node_from_trace['source'][node_from_dim]) + self._add_source(node_from, node_from_dim, node_to, node_to_dim, init=True) def _inherit_all_computation(self, node_from, node_to): node_from_compute = self._find_compute_trace_from_node(node_from) @@ -205,14 +252,16 @@ def _inherit_all_computation(self, node_from, node_to): self._add_source(node_from, i, node_to, i) node_to_compute[i] = copy.deepcopy(node_from_compute[i]) - def _add_source(self, node_from, node_from_dim, node_to, node_to_dim): + def _add_source(self, node_from, node_from_dim, node_to, node_to_dim, init=False): node_from_dim = self._transform_index(node_from, node_from_dim) node_from_trace = self._find_trace_from_node(node_from) node_to_dim = self._transform_index(node_to, node_to_dim) node_to_trace = self._find_trace_from_node(node_to) node_from_idx = _find_idx_by_name(node_from.name, self.nodes_list) - node_to_trace['source'][node_to_dim].append({node_from_idx: node_from_dim}) - node_to_trace['source'][node_to_dim].extend(node_from_trace['source'][node_from_dim]) + if init: + node_to_trace['source'][node_to_dim] = {} + node_to_trace['source'][node_to_dim][node_from_idx] = node_from_dim + node_to_trace['source'][node_to_dim].update(node_from_trace['source'][node_from_dim]) def _mark_computation_from_node(self, node_from, node_to, exclude=None): if exclude == None: @@ -485,11 +534,11 @@ def _assign_einsum_index(self, node, idx): source_idx = left_str.index(right_indice) self._inherit_index(input_nodes[left_idx], source_idx, node, right_idx) - for i in sum_index: - for left_idx, left_str in enumerate(left): - if i in left_str: - 
self._mark_computation(node, idx, left_str.index(i)) - break + # for i in sum_index: + # for left_idx, left_str in enumerate(left): + # if i in left_str: + # self._mark_computation(node, idx, left_str.index(i)) + # break def _assign_softmax_index(self, node, idx): """ @@ -679,18 +728,56 @@ def trace_index(self): raise NotImplementedError(node.op, "op not implemented yet!") # self._merge_equal_idx() - def check_index(self, trace_idx, start_idx, end_idx): - for i in range(start_idx, end_idx + 1): - cur_idx = self.idx_trace_list[i]['idx'] - cur_compute = self.idx_trace_list[i]['compute'] - if trace_idx in cur_compute: - for j in cur_compute[trace_idx]: - if j < start_idx or j > end_idx: - return False - # same_idx = [1 if j == trace_idx else 0 for j in cur_idx] - # if sum(same_idx) > 1: - # return False + def check_index_source(self, start_dim, start_node, start_idx, end_dim, end_node): + """ + Check 2 given index: one index should be source of the other + Args: + start_idx(int): start node chunk dim + start_node(node): start node + end_idx(int): end node chunk dim + end_node(node): end node + + Returns: + bool: True if check pass + """ + start_node_idx = _find_idx_by_name(start_node.name, self.nodes_list) + end_node_trace = self._find_trace_from_node(end_node) + end_node_trace_source = end_node_trace['source'][end_dim] + sorted_source = sorted(end_node_trace_source.items(), key=lambda d:d[0], reverse=True) + for node_idx, node_dim in sorted_source: + if node_idx == start_node_idx and node_dim == start_dim: + return True + # it means we meet a node outside the loop, and the node is not input node + if node_idx < start_idx: + return False + return False + + def check_index_compute(self, start_idx, end_dim, end_node, end_idx): + """ + Check 2 given index: check they haven't been computed in the source trace. 
+ Args: + start_idx(int): start node chunk dim + start_node(node): start node + end_idx(int): end node chunk dim + end_node(node): end node + + Returns: + bool: True if check pass + """ + end_node_trace = self._find_trace_from_node(end_node) + end_node_compute = end_node_trace['compute'][end_dim] + if any(start_idx <= i <= end_idx for i in end_node_compute): + return False return True + # end_node_trace_source = end_node_trace['source'][end_dim] + # for node_idx, node_dim in end_node_trace_source.items(): + # if node_idx < start_node_idx or node_idx > end_node_idx: + # continue + # compute_list = self.idx_trace_list[node_idx]['compute'][node_dim] + # if any(start_node_idx <= i <= end_node_idx for i in compute_list): + # return False + # return True + class MemoryEstimator(object): def __init__(self) -> None: @@ -951,88 +1038,81 @@ def _is_not_compute(self, trace, chunk_range, dim_idx): return True return False - def _detect_flow(self, before_trace, after_trace, start_idx, end_idx, dim_idx): - inputs, outputs = _find_input_and_output_nodes(self.node_list[start_idx:end_idx + 1]) - chunk_info = {'inputs': inputs, 'outputs': outputs} - flow_flag = False - - for idx in range(start_idx, end_idx + 1): - node = self.node_list[idx] - mix_flow_var = self.flow_tracer.get_flow_mix(node) - if mix_flow_var is None: - continue - - # if there is a flow mix, op must be in [mul, add, div, matmul] - # element-wise op requires dim to be equal in every dim - if any(n in node.name for n in ['mul', 'add']): - for i in node.args: - if type(i) == type(mix_flow_var) and i != mix_flow_var: - main_flow_var = i - # if mix flow is a broadcast in chunk dim, - # TODO need to move that flow out of the chunk - if mix_flow_var.meta['tensor_meta'].shape[dim_idx] == 1: - flow_flag = True - for i in self.flow_tracer.get_same_flow_node(chunk_info['inputs'], mix_flow_var): - chunk_info['inputs'].remove(i) - # else, we need to chunk mix var as well - else: - # TODO chunk another value - flow_flag = False 
- break - else: - raise NotImplementedError("%s not implemented" % node.name) - return flow_flag, chunk_info + def _check_duplicate_map(self, chunk_infos): + dim_map = [(i['inputs_dim'], i['outputs_dim']) for i in chunk_infos] + remove_list = [] + for idx1, (input_dim1, output_dim1) in enumerate(dim_map): + for idx2, (input_dim2, output_dim2) in enumerate(dim_map): + if idx1 == idx2: + continue + # it means an index create 2 copy of itself + # eg. a = torch.matmul(x, x.transpose(-1, -2)) + # TODO currently remove it, deal with this in future + if input_dim1 == input_dim2 and output_dim1 != output_dim2: + remove_list.append(chunk_infos[idx1]) + remove_list.append(chunk_infos[idx2]) + for i in remove_list: + if i in chunk_infos: + chunk_infos.remove(i) + return chunk_infos def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx): - before_trace = input_trace[start_idx] - after_trace = output_trace[end_idx] - free_dim = [] + start_traces = input_trace[start_idx] + end_trace = output_trace[end_idx] + end_node = self.node_list[end_idx] chunk_infos = [] - for i in range(min(len(before_trace['idx']), len(after_trace['idx']))): - if not (before_trace['idx'][i] == after_trace['idx'][i] and - self._is_not_compute(before_trace, (start_idx, end_idx), i) and - self._is_not_compute(after_trace, (start_idx, end_idx), i) and - self.node_list[end_idx].meta['tensor_meta'].shape[i] != 1): - continue - if not self.index_tracer.check_index(before_trace['idx'][i], start_idx, end_idx): + for end_dim, end_trace_idx in enumerate(end_trace['idx']): + if len(start_traces) > 1: + # TODO implement multi input chunk continue - flow_flag, chunk_info = self._detect_flow(before_trace, after_trace, start_idx, end_idx, i) - if flow_flag == None: - continue - chunk_infos.append(chunk_info) - free_dim.append(i) - return free_dim, chunk_infos + for start_node, start_trace in start_traces.items(): + for start_dim, start_trace_idx in enumerate(start_trace['idx']): + # must be same trace 
idx + if start_trace_idx != end_trace_idx: + continue + # dim size cannot be 1 + if _get_node_shape(end_node)[end_dim] == 1 or \ + _get_node_shape(start_node)[start_dim] == 1: + continue + # check index source align + if not self.index_tracer.check_index_source( + start_dim, start_node, start_idx, end_dim, end_node): + continue + # check index copmute + if not self.index_tracer.check_index_compute( + start_idx, end_dim, end_node, end_idx): + continue + # detect flow meet + flow_flag, chunk_info = self.flow_tracer._detect_flow( + start_idx, start_dim, end_idx, end_dim) + if flow_flag: + continue + chunk_infos.append(chunk_info) + chunk_infos = self._check_duplicate_map(chunk_infos) + return chunk_infos def _search_possible_chunk_regions(self, max_chunk_region, peak_node): possible_chunk_region = [] output_trace = copy.deepcopy(self.index_tracer.idx_trace_list) - input_trace = [] - for i, n in enumerate(self.node_list): - if len(n.args) > 0 and n.op != 'output': - if isinstance(n.args[0], str): - input_idx = _find_idx_by_name(n.args[1].name, self.node_list) - else: - input_idx = _find_idx_by_name(n.args[0].name, self.node_list) - input_trace.append(output_trace[input_idx]) - else: - input_trace.append(None) - - for start_idx in range(max_chunk_region[0], peak_node): + input_trace = [] # trace of a node's input nodes + for _, n in enumerate(self.node_list): + cur_trace = {} + for arg in n.args: + if type(arg) == type(n) and not _is_non_compute_node_except_placeholder(arg): + cur_trace[arg] = self.index_tracer._find_trace_from_node(arg) + input_trace.append(cur_trace) + + for start_idx in range(max_chunk_region[0], peak_node + 1): for end_idx in range(peak_node, max_chunk_region[1] + 1): # skip non compute nodes - if any(op in ['placeholder', 'get_attr', 'output'] for op in - [self.node_list[start_idx].op, self.node_list[end_idx].op]): - continue - if any(any(i in name for i in ['getitem', 'getattr']) for name in - [self.node_list[start_idx].name, 
self.node_list[end_idx].name]): + if _is_non_compute_node(self.node_list[start_idx]) or \ + _is_non_compute_node(self.node_list[end_idx]): continue # select free dim - free_dim, chunk_info = self._find_free_dim(input_trace, output_trace, start_idx, end_idx) - if len(free_dim) > 0: - free_dim = [free_dim[0]] - chunk_info = [chunk_info[0]] - possible_chunk_region.append({'region': (start_idx, end_idx), 'dim': free_dim, 'chunk_info': chunk_info}) + chunk_info = self._find_free_dim(input_trace, output_trace, start_idx, end_idx) + if len(chunk_info) > 0: + possible_chunk_region.extend(chunk_info) return possible_chunk_region def _search_best_chunk_region(self, possible_chunk_regions): @@ -1044,7 +1124,8 @@ def _search_best_chunk_region(self, possible_chunk_regions): max_region_range = i['region'][1] - i['region'][0] return best_regions - def _step_search(self, peak_node, active_node): + def _step_search(self, mem_peak, active_node): + peak_node = self._find_peak_node(mem_peak) max_chunk_region = self._search_max_chunk_region(active_node, peak_node) possible_chunk_regions = self._search_possible_chunk_regions(max_chunk_region, peak_node) best_chunk_region = self._search_best_chunk_region(possible_chunk_regions) @@ -1062,19 +1143,16 @@ def search_region(self): mem_peak = init_mem_peak while True: - peak_node = self._find_peak_node(mem_peak) - chunk_region = self._step_search(peak_node, active_node) - if chunk_region is None or len(chunk_region['dim']) == 0: + chunk_region = self._step_search(mem_peak, active_node) + if chunk_region is None: break chunk_regions.append(chunk_region) mem_peak, _, active_node = self.memory_estimator.estimate_chunk_inference_mem( self.gm, [i['region'][0] for i in chunk_regions], - [i['region'][1] for i in chunk_regions], [i['dim'][0] for i in chunk_regions], [1] * len(chunk_regions)) - + [i['region'][1] for i in chunk_regions], [i['inputs_dim'] for i in chunk_regions], [1] * len(chunk_regions)) if self._stop_search(init_mem_peak, mem_peak): 
break - return chunk_regions @@ -1164,6 +1242,35 @@ def _find_input_and_output_nodes(nodes: List[Node]): return input_nodes, output_nodes +def _find_chunk_input_and_output_nodes(nodes: List[Node]): + """ + Find non-compute input and output node names. + input nodes are nodes used in the list + output nodes are nodes will use nodes in the list + """ + input_nodes = [] + output_nodes = [] + + # if a node has an input node which is not in the node list + # we treat that input node as the input of the checkpoint function + for node in nodes: + for input_node in node._input_nodes.keys(): + if input_node not in nodes and input_node not in input_nodes \ + and not _is_non_compute_node_except_placeholder(input_node): + input_nodes.append(input_node) + + # if a node has a user node which is not in the node list + # we treat that user node as the node receiving the current node output + # TODO it is unsafe to remove non compute node here + for node in nodes: + for output_node in node.users.keys(): + if output_node not in nodes and node not in output_nodes \ + and not _is_non_compute_node_except_placeholder(input_node): + output_nodes.append(node) + + return input_nodes, output_nodes + + def _find_idx_by_name(name, nodes_list): for idx, node in enumerate(nodes_list): if node.name == name: From d31e146687ebd4cefdc67500e84b7414b5760dd4 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Sat, 10 Dec 2022 17:34:40 +0800 Subject: [PATCH 029/209] code format --- chunk_codegen.py | 908 +++++++++++++++++++++++++++++------------------ 1 file changed, 560 insertions(+), 348 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index fc3c88cf91f6..e8cf0d22f157 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -4,35 +4,52 @@ from typing import List, Callable, Any, Tuple, Dict, Iterable from torch.fx.node import Node, Argument, map_arg, _type_repr, _get_qualified_name -from torch.fx.graph import _Namespace, PythonCode, _custom_builtins, _is_from_torch, _format_target, magic_methods, 
CodeGen, _origin_type_map, inplace_methods, _CustomBuiltin -from colossalai.fx.profiler import calculate_fwd_out, calculate_fwd_tmp, parameter_size, activation_size +from torch.fx.graph import ( + _Namespace, + PythonCode, + _custom_builtins, + _is_from_torch, + _format_target, + magic_methods, + CodeGen, + _origin_type_map, + inplace_methods, + _CustomBuiltin, +) +from colossalai.fx.profiler import ( + calculate_fwd_out, + calculate_fwd_tmp, + parameter_size, + activation_size, +) + CODEGEN_AVAILABLE = True -__all__ = ['ChunkCodeGen'] +__all__ = ["ChunkCodeGen"] def _delete_free_var_from_last_use(user_to_last_uses): for key, value in user_to_last_uses.items(): for n in value: - if n.op == 'placeholder': + if n.op == "placeholder": user_to_last_uses[key].remove(n) def _get_node_shape(node): - if hasattr(node.meta['tensor_meta'], "shape"): - return node.meta['tensor_meta'].shape + if hasattr(node.meta["tensor_meta"], "shape"): + return node.meta["tensor_meta"].shape return None def _is_non_compute_node(node): - if any(i in node.op for i in ['placeholder', 'get_attr', 'output']) or \ - any(i in node.name for i in ['getitem', 'getattr']): + if any(i in node.op for i in ["placeholder", "get_attr", "output"]) or any( + i in node.name for i in ["getitem", "getattr"] + ): return True return False - - + + def _is_non_compute_node_except_placeholder(node): - if any(i in node.op for i in ['get_attr', 'output']) or \ - any(i in node.name for i in ['getitem', 'getattr']): + if (any(i in node.op for i in ["get_attr", "output"]) or any(i in node.name for i in ["getitem", "getattr"])): return True return False @@ -45,42 +62,48 @@ def __init__(self, gm) -> None: def _add_trace(self, name): self.flow_trace[name] = [] - + def _add_node(self, trace_name, node): - self.flow_trace[trace_name].append({'node': node, 'inside_depend': [], 'outside_depend': []}) - + self.flow_trace[trace_name].append( + {"node": node, "inside_depend": [], "outside_depend": []} + ) + def 
_add_inside_depend(self, flow_name, node, inside_depend_node): for i in self.flow_trace[flow_name]: - if i['node'] == node: - i['inside_depend'].append(inside_depend_node) + if i["node"] == node: + i["inside_depend"].append(inside_depend_node) return raise RuntimeError("node not found") - - def _add_outside_depend(self, flow_name, node, outside_depend_node, outside_depend_trace): + + def _add_outside_depend( + self, flow_name, node, outside_depend_node, outside_depend_trace + ): for i in self.flow_trace[flow_name]: - if i['node'] == node: - i['outside_depend'].append({outside_depend_trace: outside_depend_node}) + if i["node"] == node: + i["outside_depend"].append({outside_depend_trace: outside_depend_node}) return raise RuntimeError("node not found") def _init_trace(self): for i in self.node_list: - if i.op == 'placeholder': + if i.op == "placeholder": self._add_trace(i.name) self._add_node(i.name, i) def _is_non_compute_node(self, node): - if any(i in node.op for i in ['placeholder', 'get_attr', 'output']) or \ - any(i in node.name for i in ['getitem', 'getattr']): + if any(i in node.op for i in ["placeholder", "get_attr", "output"]) or any( + i in node.name for i in ["getitem", "getattr"] + ): return True return False - + def _is_non_compute_node_except_placeholder(self, node): - if any(i in node.op for i in ['get_attr', 'output']) or \ - any(i in node.name for i in ['getitem', 'getattr']): + if any(i in node.op for i in ["get_attr", "output"]) or any( + i in node.name for i in ["getitem", "getattr"] + ): return True return False - + def _find_flow_for_node(self, node): if type(self.node_list[0]) != type(node): return None @@ -88,54 +111,57 @@ def _find_flow_for_node(self, node): return None for name, trace in self.flow_trace.items(): for i in trace: - if node == i['node']: + if node == i["node"]: return name if any(i in node.name for i in ["ones_like"]): self._add_trace(node.name) self._add_node(node.name, node) return node.name raise RuntimeError("node not 
found") - + def _find_first_valid_flow(self, flow): for i in flow: if i is not None: return i raise RuntimeError("invalid flow") - + def find_node_flow(self, node): for name, trace in self.flow_trace.items(): for i in trace: - if node == i['node']: + if node == i["node"]: return name, i raise RuntimeError("invalid node") - + def get_flow_mix(self, node): if self._is_non_compute_node(node): return None _, node_trace = self.find_node_flow(node) - if len(node_trace['outside_depend']) == 0: + if len(node_trace["outside_depend"]) == 0: return None - elif len(node_trace['outside_depend']) > 1: + elif len(node_trace["outside_depend"]) > 1: raise NotImplementedError - vars = list(node_trace['outside_depend'][0].values())[0] + vars = list(node_trace["outside_depend"][0].values())[0] return vars - + def get_same_flow_node(self, node_list, node): name, _ = self.find_node_flow(node) result = [] for i in self.flow_trace[name]: - if i['node'] in node_list: - result.append(i['node']) + if i["node"] in node_list: + result.append(i["node"]) return result - - def trace_flow(self): + + def trace_flow(self): # init trace self._init_trace() for node in self.node_list: # skip if non compute node - if all(type(arg) != type(node) or self._is_non_compute_node_except_placeholder(arg) for arg in node.args) \ - or self._is_non_compute_node(node): + if all( + type(arg) != type(node) + or self._is_non_compute_node_except_placeholder(arg) + for arg in node.args + ) or self._is_non_compute_node(node): continue node_input_flows = [self._find_flow_for_node(arg) for arg in node.args] @@ -148,35 +174,45 @@ def trace_flow(self): elif node_input_flow == node_domin_flow: self._add_inside_depend(node_domin_flow, node, arg) else: - self._add_outside_depend(node_domin_flow, node, arg, node_input_flow) + self._add_outside_depend( + node_domin_flow, node, arg, node_input_flow + ) return self.flow_trace - + def _detect_flow(self, start_idx, start_dim, end_idx, end_dim): - inputs, outputs = 
_find_chunk_input_and_output_nodes(self.node_list[start_idx:end_idx + 1]) - chunk_info = {'region': (start_idx, end_idx), - 'inputs': inputs, 'inputs_dim': start_dim, - 'outputs': outputs, 'outputs_dim': end_dim, - 'args': {}} + inputs, outputs = _find_chunk_input_and_output_nodes( + self.node_list[start_idx : end_idx + 1] + ) + chunk_info = { + "region": (start_idx, end_idx), + "inputs": inputs, + "inputs_dim": start_dim, + "outputs": outputs, + "outputs_dim": end_dim, + "args": {}, + } flow_flag = False - + for idx in range(start_idx, end_idx + 1): node = self.node_list[idx] mix_flow_var = self.get_flow_mix(node) if mix_flow_var is None: continue - + # if there is a flow mix, op must be in [mul, add, div, matmul] # element-wise op requires dim to be equal in every dim - if any(n in node.name for n in ['mul', 'add']): + if any(n in node.name for n in ["mul", "add"]): for i in node.args: if type(i) == type(mix_flow_var) and i != mix_flow_var: main_flow_var = i - # if mix flow is a broadcast in chunk dim, + # if mix flow is a broadcast in chunk dim, # TODO need to move that flow out of the chunk - if mix_flow_var.meta['tensor_meta'].shape[dim_idx] == 1: + if mix_flow_var.meta["tensor_meta"].shape[dim_idx] == 1: flow_flag = True - for i in self.get_same_flow_node(chunk_info['inputs'], mix_flow_var): - chunk_info['inputs'].remove(i) + for i in self.get_same_flow_node( + chunk_info["inputs"], mix_flow_var + ): + chunk_info["inputs"].remove(i) # else, we need to chunk mix var as well else: # TODO chunk another value @@ -199,51 +235,53 @@ def __init__(self, gm) -> None: def _init_idx_trace_list(self): idx_trace_list = [] for n in self.nodes_list: - if _get_node_shape(n) != None: + if _get_node_shape(n) != None: cur_trace = { - 'idx': [None for _ in range(len(_get_node_shape(n)))], - 'compute': [[] for _ in range(len(_get_node_shape(n)))], - 'source': [{} for _ in range(len(_get_node_shape(n)))], + "idx": [None for _ in range(len(_get_node_shape(n)))], + "compute": [[] 
for _ in range(len(_get_node_shape(n)))], + "source": [{} for _ in range(len(_get_node_shape(n)))], } else: - cur_trace = {'idx': [], 'compute': [], 'source': []} + cur_trace = {"idx": [], "compute": [], "source": []} idx_trace_list.append(cur_trace) return idx_trace_list - + def _add_index(self): """ Update the count and return it. To record the idx number. - + Returns: idx_count: int - """ + """ self.idx_count += 1 return self.idx_count - + def _del_dim(self, idx, dim_idx): - self.idx_trace_list[idx]['idx'].pop(dim_idx) - self.idx_trace_list[idx]['compute'].pop(dim_idx) - self.idx_trace_list[idx]['source'].pop(dim_idx) - + self.idx_trace_list[idx]["idx"].pop(dim_idx) + self.idx_trace_list[idx]["compute"].pop(dim_idx) + self.idx_trace_list[idx]["source"].pop(dim_idx) + def _add_dim(self, idx, dim_idx): - self.idx_trace_list[idx]['idx'].insert(dim_idx, self._add_index()) - self.idx_trace_list[idx]['compute'].insert(dim_idx, []) - self.idx_trace_list[idx]['source'].insert(dim_idx, {}) - + self.idx_trace_list[idx]["idx"].insert(dim_idx, self._add_index()) + self.idx_trace_list[idx]["compute"].insert(dim_idx, []) + self.idx_trace_list[idx]["source"].insert(dim_idx, {}) + def _transform_index(self, node, node_dim): node_idx = self._find_idx_trace_from_node(node) dims = list(range(len(node_idx))) return dims[node_dim] - + def _inherit_index(self, node_from, node_from_dim, node_to, node_to_dim): node_from_dim = self._transform_index(node_from, node_from_dim) node_to_dim = self._transform_index(node_to, node_to_dim) node_from_trace = self._find_trace_from_node(node_from) node_to_trace = self._find_trace_from_node(node_to) - node_to_trace['idx'][node_to_dim] = node_from_trace['idx'][node_from_dim] - node_to_trace['compute'][node_to_dim] = copy.deepcopy(node_from_trace['compute'][node_from_dim]) + node_to_trace["idx"][node_to_dim] = node_from_trace["idx"][node_from_dim] + node_to_trace["compute"][node_to_dim] = copy.deepcopy( + node_from_trace["compute"][node_from_dim] + ) 
self._add_source(node_from, node_from_dim, node_to, node_to_dim, init=True) - + def _inherit_all_computation(self, node_from, node_to): node_from_compute = self._find_compute_trace_from_node(node_from) node_to_compute = self._find_compute_trace_from_node(node_to) @@ -251,7 +289,7 @@ def _inherit_all_computation(self, node_from, node_to): for i in range(len(node_from_compute)): self._add_source(node_from, i, node_to, i) node_to_compute[i] = copy.deepcopy(node_from_compute[i]) - + def _add_source(self, node_from, node_from_dim, node_to, node_to_dim, init=False): node_from_dim = self._transform_index(node_from, node_from_dim) node_from_trace = self._find_trace_from_node(node_from) @@ -259,10 +297,12 @@ def _add_source(self, node_from, node_from_dim, node_to, node_to_dim, init=False node_to_trace = self._find_trace_from_node(node_to) node_from_idx = _find_idx_by_name(node_from.name, self.nodes_list) if init: - node_to_trace['source'][node_to_dim] = {} - node_to_trace['source'][node_to_dim][node_from_idx] = node_from_dim - node_to_trace['source'][node_to_dim].update(node_from_trace['source'][node_from_dim]) - + node_to_trace["source"][node_to_dim] = {} + node_to_trace["source"][node_to_dim][node_from_idx] = node_from_dim + node_to_trace["source"][node_to_dim].update( + node_from_trace["source"][node_from_dim] + ) + def _mark_computation_from_node(self, node_from, node_to, exclude=None): if exclude == None: exclude = [] @@ -278,7 +318,7 @@ def _mark_computation_from_node(self, node_from, node_to, exclude=None): for j in node_from_compute[i]: if j not in node_to_compute[i]: node_to_compute[i].append(j) - + def _mark_idx_equal(self, node1, dim1, node2, dim2): """ Mark 2 index to be equal. @@ -293,7 +333,7 @@ def _mark_idx_equal(self, node1, dim1, node2, dim2): # self._add_source(node2, dim2, node1, dim1) # else: # self._add_source(node1, dim1, node2, dim2) - + def _mark_computation(self, node, idx, dim): """ Mark some dims of node as computed. 
@@ -302,14 +342,14 @@ def _mark_computation(self, node, idx, dim): node (node) idx (int): node index dim (list or int): dims to be marked as computed - """ + """ if isinstance(dim, int): dim = [dim] dims = list(range(len(_get_node_shape(node)))) for d in dim: cur_dim = dims[d] - if idx not in self.idx_trace_list[idx]['compute'][cur_dim]: - self.idx_trace_list[idx]['compute'][cur_dim].append(idx) + if idx not in self.idx_trace_list[idx]["compute"][cur_dim]: + self.idx_trace_list[idx]["compute"][cur_dim].append(idx) def _find_trace_from_node(self, node): """ @@ -320,11 +360,11 @@ def _find_trace_from_node(self, node): Returns: idx (list): idx of the node compute (list): computed idx of the node. - """ + """ node_idx = _find_idx_by_name(node.name, self.nodes_list) node_dict = self.idx_trace_list[node_idx] return node_dict - + def _find_idx_trace_from_node(self, node): """ Find node idx trace by the node. @@ -333,10 +373,10 @@ def _find_idx_trace_from_node(self, node): node (node) Returns: idx (list): idx of the node - """ + """ node_idx = _find_idx_by_name(node.name, self.nodes_list) - return self.idx_trace_list[node_idx]['idx'] - + return self.idx_trace_list[node_idx]["idx"] + def _find_compute_trace_from_node(self, node): """ Find node compute trace by the node. @@ -345,10 +385,10 @@ def _find_compute_trace_from_node(self, node): node (node) Returns: compute (list): computed idx of the node. - """ + """ node_idx = _find_idx_by_name(node.name, self.nodes_list) - return self.idx_trace_list[node_idx]['compute'] - + return self.idx_trace_list[node_idx]["compute"] + def _assign_index_as_input(self, node, node_idx, input_node=None): """ Assign node's trace as its input node. 
@@ -360,13 +400,13 @@ def _assign_index_as_input(self, node, node_idx, input_node=None): if input_node == None: input_node = node.args[0] input_node_idx = _find_idx_by_name(input_node.name, self.nodes_list) - input_node_idx_trace = self.idx_trace_list[input_node_idx]['idx'] - + input_node_idx_trace = self.idx_trace_list[input_node_idx]["idx"] + new_idx_trace = copy.deepcopy(input_node_idx_trace) - self.idx_trace_list[node_idx]['idx'] = new_idx_trace - + self.idx_trace_list[node_idx]["idx"] = new_idx_trace + self._inherit_all_computation(input_node, node) - + def _assign_all_index(self, node, node_idx): """ Add new index for all node's dims. @@ -374,12 +414,12 @@ def _assign_all_index(self, node, node_idx): Args: node (node) node_idx (int) - """ - shape = node.meta['tensor_meta'].shape + """ + shape = node.meta["tensor_meta"].shape new_trace = [] for _ in shape: new_trace.append(self._add_index()) - self.idx_trace_list[node_idx]['idx'] = new_trace + self.idx_trace_list[node_idx]["idx"] = new_trace def _assign_transpose_index(self, node, node_idx): """ @@ -390,14 +430,14 @@ def _assign_transpose_index(self, node, node_idx): Args: node (node) node_idx (int) - """ + """ input_node = node.args[0] tranpose_dim = node.args[1:] - + self._assign_index_as_input(node, node_idx, input_node) self._inherit_index(input_node, tranpose_dim[1], node, tranpose_dim[0]) self._inherit_index(input_node, tranpose_dim[0], node, tranpose_dim[1]) - + def _assign_permute_index(self, node, node_idx): """ Assign index for permute op. @@ -407,14 +447,14 @@ def _assign_permute_index(self, node, node_idx): Args: node (node) node_idx (int) - """ + """ permute_dim = node.args[1:] input_node = node.args[0] - + self._assign_index_as_input(node, node_idx, input_node) for idx, d in enumerate(permute_dim): self._inherit_index(input_node, d, node, idx) - + def _assign_linear_index(self, node, node_idx): """ Assign index for linear op. 
@@ -431,13 +471,13 @@ def _assign_linear_index(self, node, node_idx): bias = None else: input_node, weight, bias = node.args - + self._assign_index_as_input(node, node_idx) self._inherit_index(weight, 1, node, -1) self._mark_computation(node, node_idx, [-1]) self._mark_idx_equal(input_node, -1, weight, 0) - + if bias: self._mark_idx_equal(input_node, -1, bias, 0) @@ -451,10 +491,10 @@ def _assign_matmul_index(self, node, node_idx): Args: node (node) node_idx (int) - """ + """ matmul_left, matmul_right = node.args - - assert(len(_get_node_shape(matmul_left)) == len(_get_node_shape(matmul_right))) + + assert len(_get_node_shape(matmul_left)) == len(_get_node_shape(matmul_right)) self._assign_index_as_input(node, node_idx, matmul_left) self._inherit_index(matmul_right, -1, node, -1) @@ -474,7 +514,7 @@ def _assign_layernorm_index(self, node, idx): """ self._assign_index_as_input(node, idx) self._mark_computation(node, idx, [-1, -2]) - + def _assign_elementwise_index(self, node, idx): """ Assign index for element-wise op (eg. relu sigmoid add mul). @@ -484,7 +524,7 @@ def _assign_elementwise_index(self, node, idx): Args: node (node) node_idx (int) - """ + """ self._assign_index_as_input(node, idx) nodes_in = [] for node_in in node.args: @@ -498,13 +538,13 @@ def _assign_elementwise_index(self, node, idx): for i in range(-1, -min(len(node_in0_shape), len(node_in1_shape)) - 1, -1): if node_in0_shape[i] == node_in1_shape[i]: self._mark_idx_equal(nodes_in[0], i, nodes_in[1], i) - + def _assgin_no_change_index(self, node, idx): self._assign_index_as_input(node, idx) for node_in in node.args: if type(node_in) == type(node): self._mark_computation_from_node(node_in, node) - + def _assign_einsum_index(self, node, idx): """ Assign index for einsum op. 
@@ -515,11 +555,11 @@ def _assign_einsum_index(self, node, idx): """ patterns = node.args[0] input_nodes = node.args[1:] - + patterns = patterns.replace(" ", "") left, right = patterns.split("->") left = left.split(",") - + all_index = [] for i in left: for c in i: @@ -527,19 +567,21 @@ def _assign_einsum_index(self, node, idx): all_index = set(all_index) free_index = set([i for i in right]) sum_index = all_index - free_index - + for right_idx, right_indice in enumerate(right): for left_idx, left_str in enumerate(left): if right_indice in left_str: source_idx = left_str.index(right_indice) - self._inherit_index(input_nodes[left_idx], source_idx, node, right_idx) - + self._inherit_index( + input_nodes[left_idx], source_idx, node, right_idx + ) + # for i in sum_index: # for left_idx, left_str in enumerate(left): # if i in left_str: # self._mark_computation(node, idx, left_str.index(i)) # break - + def _assign_softmax_index(self, node, idx): """ Assign index for softmax op. @@ -549,10 +591,10 @@ def _assign_softmax_index(self, node, idx): Args: node (node) node_idx (int) - """ + """ self._assign_index_as_input(node, idx) - self._mark_computation(node, idx, [node.kwargs['dim']]) - + self._mark_computation(node, idx, [node.kwargs["dim"]]) + def _assign_unsqueeze_index(self, node, node_idx): """ Assign index for unsqueeze op. 
@@ -564,10 +606,10 @@ def _assign_unsqueeze_index(self, node, node_idx): """ self._del_dim(node_idx, -1) self._assign_index_as_input(node, node_idx) - self.idx_trace_list[node_idx]['idx'].insert(node.args[1], self._add_index()) - self.idx_trace_list[node_idx]['compute'].insert(node.args[1], []) - self.idx_trace_list[node_idx]['source'].insert(node.args[1], []) - + self.idx_trace_list[node_idx]["idx"].insert(node.args[1], self._add_index()) + self.idx_trace_list[node_idx]["compute"].insert(node.args[1], []) + self.idx_trace_list[node_idx]["source"].insert(node.args[1], []) + def _assign_dropout_index(self, node, node_idx): """ Assign index for unsqueeze op. @@ -576,9 +618,9 @@ def _assign_dropout_index(self, node, node_idx): Args: node (node) node_idx (int) - """ + """ self._assign_index_as_input(node, node_idx) - + def _assign_ones_like_index(self, node, node_idx): """ Assign index for oneslike op. @@ -587,7 +629,7 @@ def _assign_ones_like_index(self, node, node_idx): Args: node (node) node_idx (int) - """ + """ self._assign_all_index(node, node_idx) def _assign_view_reshape_index(self, node, node_idx): @@ -604,16 +646,16 @@ def _assign_view_reshape_index(self, node, node_idx): Args: node (node) node_idx (int) - """ + """ # get data, turn into number origin_node = node.args[0] - origin_shape = origin_node.meta['tensor_meta'].shape + origin_shape = origin_node.meta["tensor_meta"].shape target_shape = [] for i in range(1, len(node.args)): if isinstance(node.args[i], int): target_shape.append(node.args[i]) else: - target_shape.append(node.args[i].meta['fwd_out'][0]) + target_shape.append(node.args[i].meta["fwd_out"][0]) # compute the value of -1 if -1 in target_shape: @@ -641,7 +683,13 @@ def _assign_view_reshape_index(self, node, node_idx): dim_to = [dim_equal.index(False), dim_equal.index(False) + 1] self._del_dim(node_idx, -1) else: - raise NotImplementedError("shape" + str(origin_shape) + 'and' + str(target_shape) + "view not implemented") + raise 
NotImplementedError( + "shape" + + str(origin_shape) + + "and" + + str(target_shape) + + "view not implemented" + ) # get new index origin_trace = self._find_idx_trace_from_node(origin_node) @@ -651,7 +699,7 @@ def _assign_view_reshape_index(self, node, node_idx): self._del_dim(node_idx, i) for i in dim_to: self._add_dim(node_idx, i) - + # inherit computation compute_log = self._find_compute_trace_from_node(origin_node) for i in dim_from: @@ -659,13 +707,15 @@ def _assign_view_reshape_index(self, node, node_idx): for j in dim_to: self._mark_computation(node, node_idx, [j]) break - + # log view, not used now - view_dict = {"idx_from": [origin_trace[i] for i in dim_from], - "dim_from": dim_from, - "idx_to": [self.idx_trace_list[node_idx]['idx'][i] for i in dim_to], - "dim_to": dim_to} - self.idx_view_list.append(view_dict) + view_dict = { + "idx_from": [origin_trace[i] for i in dim_from], + "dim_from": dim_from, + "idx_to": [self.idx_trace_list[node_idx]["idx"][i] for i in dim_to], + "dim_to": dim_to, + } + self.idx_view_list.append(view_dict) def _merge_equal_idx(self): idx_equal = copy.deepcopy(self.idx_trace_equal) @@ -674,60 +724,64 @@ def _merge_equal_idx(self): merge_to = min(idx) merge_from = max(idx) for trace in self.idx_trace_list: - if merge_from in trace['idx']: - trace['idx'] = [merge_to if i == merge_from else i for i in trace['idx']] - + if merge_from in trace["idx"]: + trace["idx"] = [ + merge_to if i == merge_from else i for i in trace["idx"] + ] + def trace_index(self): for idx, node in enumerate(self.nodes_list): - if node.op == 'placeholder': + if node.op == "placeholder": self._assign_all_index(node, idx) - elif node.op == 'call_method': - if 'transpose' in node.name: + elif node.op == "call_method": + if "transpose" in node.name: self._assign_transpose_index(node, idx) - elif 'permute' in node.name: + elif "permute" in node.name: self._assign_permute_index(node, idx) - elif 'view' in node.name or 'reshape' in node.name: + elif "view" in 
node.name or "reshape" in node.name: self._assign_view_reshape_index(node, idx) - elif 'unsqueeze' in node.name: + elif "unsqueeze" in node.name: self._assign_unsqueeze_index(node, idx) - elif any(i in node.name for i in ['to', 'contiguous']): + elif any(i in node.name for i in ["to", "contiguous"]): self._assgin_no_change_index(node, idx) else: raise NotImplementedError(node.name, "method not implemented yet!") - elif node.op == 'call_function': - if 'linear' in node.name: + elif node.op == "call_function": + if "linear" in node.name: self._assign_linear_index(node, idx) - elif 'matmul' in node.name: + elif "matmul" in node.name: self._assign_matmul_index(node, idx) - elif 'softmax' in node.name: + elif "softmax" in node.name: self._assign_softmax_index(node, idx) - elif any(n in node.name for n in ['mul', 'add', 'sigmoid', 'relu']): + elif any(n in node.name for n in ["mul", "add", "sigmoid", "relu"]): self._assign_elementwise_index(node, idx) - elif 'ones_like' in node.name: + elif "ones_like" in node.name: self._assign_ones_like_index(node, idx) - elif 'dropout' in node.name: + elif "dropout" in node.name: self._assign_dropout_index(node, idx) - elif 'einsum' in node.name: + elif "einsum" in node.name: self._assign_einsum_index(node, idx) - elif 'getattr' in node.name: - continue # get attr like shape - elif 'getitem' in node.name: - continue # get item in list + elif "getattr" in node.name: + continue # get attr like shape + elif "getitem" in node.name: + continue # get item in list else: - raise NotImplementedError(node.name, "function not implemented yet!") - elif node.op == 'call_module': - if any(n in node.name for n in ['layernorm', 'norm']): + raise NotImplementedError( + node.name, "function not implemented yet!" 
+ ) + elif node.op == "call_module": + if any(n in node.name for n in ["layernorm", "norm"]): self._assign_layernorm_index(node, idx) else: raise NotImplementedError(node.name, "module not implemented yet!") - elif node.op == 'get_attr': - self._assign_all_index(node, idx) # get param - elif node.op == 'output': + elif node.op == "get_attr": + self._assign_all_index(node, idx) # get param + elif node.op == "output": continue else: raise NotImplementedError(node.op, "op not implemented yet!") # self._merge_equal_idx() - + def check_index_source(self, start_dim, start_node, start_idx, end_dim, end_node): """ Check 2 given index: one index should be source of the other @@ -742,8 +796,10 @@ def check_index_source(self, start_dim, start_node, start_idx, end_dim, end_node """ start_node_idx = _find_idx_by_name(start_node.name, self.nodes_list) end_node_trace = self._find_trace_from_node(end_node) - end_node_trace_source = end_node_trace['source'][end_dim] - sorted_source = sorted(end_node_trace_source.items(), key=lambda d:d[0], reverse=True) + end_node_trace_source = end_node_trace["source"][end_dim] + sorted_source = sorted( + end_node_trace_source.items(), key=lambda d: d[0], reverse=True + ) for node_idx, node_dim in sorted_source: if node_idx == start_node_idx and node_dim == start_dim: return True @@ -765,7 +821,7 @@ def check_index_compute(self, start_idx, end_dim, end_node, end_idx): bool: True if check pass """ end_node_trace = self._find_trace_from_node(end_node) - end_node_compute = end_node_trace['compute'][end_dim] + end_node_compute = end_node_trace["compute"][end_dim] if any(start_idx <= i <= end_idx for i in end_node_compute): return False return True @@ -784,19 +840,23 @@ def __init__(self) -> None: pass def _get_meta_node_size(self, x): - x = x.meta['tensor_meta'] + x = x.meta["tensor_meta"] x = x.numel * torch.tensor([], dtype=x.dtype).element_size() return x def _get_output_node(self, n): - fwd_out = {x.uuid: x for x in n.meta["fwd_out"] if 
isinstance(x, torch.Tensor) and hasattr(x, 'uuid')} + fwd_out = { + x.uuid: x + for x in n.meta["fwd_out"] + if isinstance(x, torch.Tensor) and hasattr(x, "uuid") + } out_size = activation_size(fwd_out) out_node = [n.name] if out_size > 0 else [] return out_size, out_node - + def _get_output_node_size(self, n): return self._get_output_node(n)[0] - + def _add_active_node(self, n, active_list): new_active = self._get_output_node(n)[1] for i in new_active: @@ -806,7 +866,7 @@ def _add_active_node(self, n, active_list): def _get_delete_node(self, user, user_to_last_uses): delete_size = 0 delete_node = [] - if user.op not in ('placeholder', 'output'): + if user.op not in ("placeholder", "output"): nodes_to_delete = user_to_last_uses.get(user, []) if len(nodes_to_delete): out_node = [self._get_output_node(i) for i in nodes_to_delete] @@ -814,13 +874,13 @@ def _get_delete_node(self, user, user_to_last_uses): for i in range(len(out_node)): if out_node[i][0] > 0: delete_node.append(out_node[i][1][0]) - elif nodes_to_delete[i].op == 'placeholder': + elif nodes_to_delete[i].op == "placeholder": delete_node.append(nodes_to_delete[i].name) return delete_size, delete_node - + def _get_delete_node_size(self, user, user_to_last_uses): return self._get_delete_node(user, user_to_last_uses)[0] - + def _remove_deactive_node(self, user, user_to_last_uses, active_list): delete_node = self._get_delete_node(user, user_to_last_uses)[1] for i in delete_node: @@ -842,20 +902,24 @@ def register_last_uses(n: Node, user: Node): def _get_contiguous_memory(self, node, not_contiguous_list, delete=False): mem = 0 - not_contiguous_ops = ['transpose', 'permute'] + not_contiguous_ops = ["transpose", "permute"] - if node.op == 'call_function' and any(n in node.name for n in ['matmul', 'reshape']): + if node.op == "call_function" and any( + n in node.name for n in ["matmul", "reshape"] + ): for n in node.args: if n in not_contiguous_list: # matmul won't change origin tensor, but create a tmp copy mem += 
self._get_output_node_size(n) - elif node.op == 'call_module': + elif node.op == "call_module": for n in node.args: if n in not_contiguous_list: # module will just make origin tensor to contiguous if delete: not_contiguous_list.remove(n) - elif node.op == 'call_method' and any(i in node.name for i in not_contiguous_ops): + elif node.op == "call_method" and any( + i in node.name for i in not_contiguous_ops + ): if node not in not_contiguous_list: not_contiguous_list.append(node) elif any(i in node.args for i in not_contiguous_list): @@ -865,13 +929,14 @@ def _get_contiguous_memory(self, node, not_contiguous_list, delete=False): return mem def _get_chunk_ratio(self, node, chunk_dim, chunk_size): - shape = node.meta['tensor_meta'].shape + shape = node.meta["tensor_meta"].shape chunk_ratio = float(chunk_size) / shape[chunk_dim] return chunk_ratio - - def _get_chunk_delete_node_size(self, user, user_to_last_uses, chunk_ratio, node_list, start_node, end_node): - if user.op in ('placeholder', 'output'): + def _get_chunk_delete_node_size( + self, user, user_to_last_uses, chunk_ratio, node_list, start_node, end_node + ): + if user.op in ("placeholder", "output"): return 0 nodes_to_delete = user_to_last_uses.get(user, []) delete_size = 0 @@ -881,12 +946,11 @@ def _get_chunk_delete_node_size(self, user, user_to_last_uses, chunk_ratio, node delete_size += self._get_output_node_size(n) * chunk_ratio return delete_size - def _print_mem_log(self, log, nodes, title=None): if title: print(title) for idx, (l, n) in enumerate(zip(log, nodes)): - print("%s:%.2f \t" % (n.name, l), end='') + print("%s:%.2f \t" % (n.name, l), end="") if (idx + 1) % 3 == 0: print("") print("\n") @@ -895,16 +959,23 @@ def _print_compute_op_mem_log(self, log, nodes, title=None): if title: print(title) for idx, (l, n) in enumerate(zip(log, nodes)): - if n.op in ['placeholder', 'get_attr', 'output']: + if n.op in ["placeholder", "get_attr", "output"]: continue - if any(i in n.name for i in ['getitem', 
'getattr']): + if any(i in n.name for i in ["getitem", "getattr"]): continue - print("%s:%.2f \t" % (n.name, l), end='') + print("%s:%.2f \t" % (n.name, l), end="") if (idx + 1) % 3 == 0: print("") print("\n") - - def estimate_chunk_inference_mem(self, gm: torch.fx.GraphModule, start_nodes=None, end_nodes=None, chunk_dims=None, chunk_sizes=None): + + def estimate_chunk_inference_mem( + self, + gm: torch.fx.GraphModule, + start_nodes=None, + end_nodes=None, + chunk_dims=None, + chunk_sizes=None, + ): act_memory = 0.0 act_memory_peak_log = [] act_memory_after_node_log = [] @@ -915,42 +986,65 @@ def estimate_chunk_inference_mem(self, gm: torch.fx.GraphModule, start_nodes=Non user_to_last_uses = self._get_last_usr(node_list) user_to_last_uses_no_free_var = self._get_last_usr(node_list) _delete_free_var_from_last_use(user_to_last_uses_no_free_var) - - use_chunk = all(i is not None for i in [start_nodes, end_nodes, chunk_dims, chunk_sizes]) + + use_chunk = all( + i is not None for i in [start_nodes, end_nodes, chunk_dims, chunk_sizes] + ) chunk_within = False chunk_region_idx = 0 - chunk_ratio = 1 # use it to estimate chunk mem + chunk_ratio = 1 # use it to estimate chunk mem for idx, node in enumerate(node_list): # if node in chunk start nodes, change chunk ratio and add chunk_tensor if use_chunk and idx in start_nodes: chunk_within = True - chunk_ratio = self._get_chunk_ratio(node, chunk_dims[chunk_region_idx], chunk_sizes[chunk_region_idx]) - act_memory += self._get_output_node_size(node_list[end_nodes[chunk_region_idx]]) / (1024 ** 2) - + chunk_ratio = self._get_chunk_ratio( + node, chunk_dims[chunk_region_idx], chunk_sizes[chunk_region_idx] + ) + act_memory += self._get_output_node_size( + node_list[end_nodes[chunk_region_idx]] + ) / (1024**2) + # if node is placeholder, just add the size of the node - if node.op == 'placeholder': - act_memory += self._get_meta_node_size(node) * chunk_ratio / (1024 ** 2) + if node.op == "placeholder": + act_memory += 
self._get_meta_node_size(node) * chunk_ratio / (1024**2) act_memory_peak_log.append(act_memory) active_node_list.append(node.name) # skip output - elif node.op == 'output': + elif node.op == "output": continue # node is an operation, calculate tmp, output node and delete node memory else: # forward memory - act_memory += self._get_contiguous_memory(node, not_contiguous_list) * chunk_ratio / (1024 ** 2) - act_memory += self._get_output_node_size(node) * chunk_ratio / (1024 ** 2) + act_memory += ( + self._get_contiguous_memory(node, not_contiguous_list) + * chunk_ratio + / (1024**2) + ) + act_memory += ( + self._get_output_node_size(node) * chunk_ratio / (1024**2) + ) # record max act memory act_memory_peak_log.append(act_memory) # delete useless memory - act_memory -= self._get_contiguous_memory(node, not_contiguous_list, delete=True) * chunk_ratio / (1024 ** 2) + act_memory -= ( + self._get_contiguous_memory(node, not_contiguous_list, delete=True) + * chunk_ratio + / (1024**2) + ) if chunk_within: act_memory -= self._get_chunk_delete_node_size( - node, user_to_last_uses_no_free_var, chunk_ratio, node_list, - start_nodes[chunk_region_idx], end_nodes[chunk_region_idx]) / (1024 ** 2) + node, + user_to_last_uses_no_free_var, + chunk_ratio, + node_list, + start_nodes[chunk_region_idx], + end_nodes[chunk_region_idx], + ) / (1024**2) else: - act_memory -= self._get_delete_node_size(node, user_to_last_uses_no_free_var) / (1024 ** 2) + act_memory -= self._get_delete_node_size( + node, user_to_last_uses_no_free_var + ) / (1024**2) # log active node self._add_active_node(node, active_node_list) @@ -958,11 +1052,13 @@ def estimate_chunk_inference_mem(self, gm: torch.fx.GraphModule, start_nodes=Non # if node in chunk end nodes, restore chunk settings if use_chunk and idx in end_nodes: - act_memory -= self._get_output_node_size(node) * chunk_ratio / (1024 ** 2) + act_memory -= ( + self._get_output_node_size(node) * chunk_ratio / (1024**2) + ) chunk_within = False chunk_ratio = 1 
chunk_region_idx += 1 - + act_memory_after_node_log.append(act_memory) active_node_list_log.append(copy.deepcopy(active_node_list)) @@ -991,14 +1087,14 @@ def _find_peak_node(self, mem_peak): max_value = max(mem_peak) max_idx = mem_peak.index(max_value) return max_idx - + def _get_free_var(self): free_var_idx = [] for idx, n in enumerate(self.node_list): - if n.op == 'placeholder': + if n.op == "placeholder": free_var_idx.append(idx) return free_var_idx - + def _get_min_free_var(self, active_node_list, free_vars): min_len = 999 for idx, n in enumerate(active_node_list): @@ -1007,11 +1103,11 @@ def _get_min_free_var(self, active_node_list, free_vars): if len(n) < min_len: min_len = len(n) return min_len - + def _search_max_chunk_region(self, active_node, peak_node): free_vars = self._get_free_var() min_var = self._get_min_free_var(active_node, free_vars) - + # from peak_node to free_var chunk_region_start = None for i in range(peak_node, -1, -1): @@ -1029,17 +1125,19 @@ def _search_max_chunk_region(self, active_node, peak_node): if i in free_vars or i == 0: raise RuntimeError() return chunk_region_start, chunk_region_end - + def _is_not_compute(self, trace, chunk_range, dim_idx): - if trace['idx'][dim_idx] not in trace['compute']: + if trace["idx"][dim_idx] not in trace["compute"]: return True - if trace['idx'][dim_idx] in trace['compute'] and \ - all(i < chunk_range[0] or i > chunk_range[1] for i in trace['compute'][trace['idx'][dim_idx]]): + if trace["idx"][dim_idx] in trace["compute"] and all( + i < chunk_range[0] or i > chunk_range[1] + for i in trace["compute"][trace["idx"][dim_idx]] + ): return True return False - + def _check_duplicate_map(self, chunk_infos): - dim_map = [(i['inputs_dim'], i['outputs_dim']) for i in chunk_infos] + dim_map = [(i["inputs_dim"], i["outputs_dim"]) for i in chunk_infos] remove_list = [] for idx1, (input_dim1, output_dim1) in enumerate(dim_map): for idx2, (input_dim2, output_dim2) in enumerate(dim_map): @@ -1055,36 +1153,41 @@ def 
_check_duplicate_map(self, chunk_infos): if i in chunk_infos: chunk_infos.remove(i) return chunk_infos - + def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx): start_traces = input_trace[start_idx] end_trace = output_trace[end_idx] end_node = self.node_list[end_idx] chunk_infos = [] - for end_dim, end_trace_idx in enumerate(end_trace['idx']): + for end_dim, end_trace_idx in enumerate(end_trace["idx"]): if len(start_traces) > 1: # TODO implement multi input chunk continue for start_node, start_trace in start_traces.items(): - for start_dim, start_trace_idx in enumerate(start_trace['idx']): + for start_dim, start_trace_idx in enumerate(start_trace["idx"]): # must be same trace idx if start_trace_idx != end_trace_idx: continue # dim size cannot be 1 - if _get_node_shape(end_node)[end_dim] == 1 or \ - _get_node_shape(start_node)[start_dim] == 1: + if ( + _get_node_shape(end_node)[end_dim] == 1 + or _get_node_shape(start_node)[start_dim] == 1 + ): continue # check index source align if not self.index_tracer.check_index_source( - start_dim, start_node, start_idx, end_dim, end_node): + start_dim, start_node, start_idx, end_dim, end_node + ): continue # check index copmute if not self.index_tracer.check_index_compute( - start_idx, end_dim, end_node, end_idx): + start_idx, end_dim, end_node, end_idx + ): continue # detect flow meet flow_flag, chunk_info = self.flow_tracer._detect_flow( - start_idx, start_dim, end_idx, end_dim) + start_idx, start_dim, end_idx, end_dim + ) if flow_flag: continue chunk_infos.append(chunk_info) @@ -1098,59 +1201,78 @@ def _search_possible_chunk_regions(self, max_chunk_region, peak_node): for _, n in enumerate(self.node_list): cur_trace = {} for arg in n.args: - if type(arg) == type(n) and not _is_non_compute_node_except_placeholder(arg): + if type(arg) == type(n) and not _is_non_compute_node_except_placeholder( + arg + ): cur_trace[arg] = self.index_tracer._find_trace_from_node(arg) input_trace.append(cur_trace) for 
start_idx in range(max_chunk_region[0], peak_node + 1): for end_idx in range(peak_node, max_chunk_region[1] + 1): # skip non compute nodes - if _is_non_compute_node(self.node_list[start_idx]) or \ - _is_non_compute_node(self.node_list[end_idx]): + if _is_non_compute_node( + self.node_list[start_idx] + ) or _is_non_compute_node(self.node_list[end_idx]): continue - + # select free dim - chunk_info = self._find_free_dim(input_trace, output_trace, start_idx, end_idx) + chunk_info = self._find_free_dim( + input_trace, output_trace, start_idx, end_idx + ) if len(chunk_info) > 0: possible_chunk_region.extend(chunk_info) return possible_chunk_region - + def _search_best_chunk_region(self, possible_chunk_regions): max_region_range = 0 best_regions = None for i in possible_chunk_regions: - if i['region'][1] - i['region'][0] > max_region_range: + if i["region"][1] - i["region"][0] > max_region_range: best_regions = i - max_region_range = i['region'][1] - i['region'][0] + max_region_range = i["region"][1] - i["region"][0] return best_regions - + def _step_search(self, mem_peak, active_node): peak_node = self._find_peak_node(mem_peak) max_chunk_region = self._search_max_chunk_region(active_node, peak_node) - possible_chunk_regions = self._search_possible_chunk_regions(max_chunk_region, peak_node) + possible_chunk_regions = self._search_possible_chunk_regions( + max_chunk_region, peak_node + ) best_chunk_region = self._search_best_chunk_region(possible_chunk_regions) return best_chunk_region - + def _stop_search(self, init_mem_peak, mem_peak): sorted_init_mem_peak = sorted(init_mem_peak) if max(mem_peak) < sorted_init_mem_peak[int(len(sorted_init_mem_peak) * 0.5)]: return True return False - + def search_region(self): chunk_regions = [] - init_mem_peak, _, active_node = self.memory_estimator.estimate_chunk_inference_mem(self.gm) + ( + init_mem_peak, + _, + active_node, + ) = self.memory_estimator.estimate_chunk_inference_mem(self.gm) mem_peak = init_mem_peak - + while True: 
chunk_region = self._step_search(mem_peak, active_node) if chunk_region is None: break - + chunk_regions.append(chunk_region) - mem_peak, _, active_node = self.memory_estimator.estimate_chunk_inference_mem( - self.gm, [i['region'][0] for i in chunk_regions], - [i['region'][1] for i in chunk_regions], [i['inputs_dim'] for i in chunk_regions], [1] * len(chunk_regions)) + ( + mem_peak, + _, + active_node, + ) = self.memory_estimator.estimate_chunk_inference_mem( + self.gm, + [i["region"][0] for i in chunk_regions], + [i["region"][1] for i in chunk_regions], + [i["inputs_dim"] for i in chunk_regions], + [1] * len(chunk_regions), + ) if self._stop_search(init_mem_peak, mem_peak): break return chunk_regions @@ -1180,18 +1302,24 @@ def _get_first_non_single_dim(shape): def _gen_loop_start(chunk_input_meta, chunk_output, chunk_dim, chunk_size=2): if len(chunk_input_meta) == 1: node = chunk_input_meta[0] - node_shape = node.meta['tensor_meta'].shape - free_shape = [node_shape[i] if i in chunk_dim else 1 for i in range(len(node_shape))] + node_shape = node.meta["tensor_meta"].shape + free_shape = [ + node_shape[i] if i in chunk_dim else 1 for i in range(len(node_shape)) + ] chunk_dim = _get_first_non_single_dim(free_shape) chunk_slice = _gen_chunk_slice_dim(chunk_dim, "gen_chunk_idx", node_shape) - out_shape = str(list(chunk_output.meta['tensor_meta'].shape)) - - context = "chunk_result = torch.empty(%s, dtype=%s.dtype, device=%s.device); chunk_size = %d\nfor gen_chunk_idx in range" % ( - out_shape, node.name, node.name, chunk_size) + out_shape = str(list(chunk_output.meta["tensor_meta"].shape)) + + context = ( + "chunk_result = torch.empty(%s, dtype=%s.dtype, device=%s.device); chunk_size = %d\nfor gen_chunk_idx in range" + % (out_shape, node.name, node.name, chunk_size) + ) context += "(0, %s.shape[%d], chunk_size):\n" % (node.name, chunk_dim) context += " chunk_tensor = %s%s\n" % (node.name, chunk_slice) else: - raise NotImplementedError("input with size %d not 
implemented" % len(chunk_input_meta)) + raise NotImplementedError( + "input with size %d not implemented" % len(chunk_input_meta) + ) return context @@ -1199,17 +1327,27 @@ def _gen_loop_end(chunk_outputs, chunk_inputs, node_list, chunk_dim): chunk_inputs_name = chunk_inputs[0].name chunk_outputs_name = chunk_outputs.name chunk_outputs_idx = _find_idx_by_name(chunk_outputs_name, node_list) - chunk_output_shape = chunk_outputs.meta['tensor_meta'].shape - free_shape = [chunk_output_shape[i] if i in chunk_dim else 1 for i in range(len(chunk_output_shape))] + chunk_output_shape = chunk_outputs.meta["tensor_meta"].shape + free_shape = [ + chunk_output_shape[i] if i in chunk_dim else 1 + for i in range(len(chunk_output_shape)) + ] chunk_dim = _get_first_non_single_dim(free_shape) chunk_slice = _gen_chunk_slice_dim(chunk_dim, "gen_chunk_idx", chunk_output_shape) context = " chunk_result%s = %s\n" % (chunk_slice, chunk_outputs_name) - context += chunk_outputs_name + " = chunk_result; chunk_result = None; chunk_size = None" - + context += ( + chunk_outputs_name + " = chunk_result; chunk_result = None; chunk_size = None" + ) + # determine if its the last use for chunk input users_name = list(chunk_inputs[0].users.keys()) - if all([_find_idx_by_name(user.name, node_list) <= chunk_outputs_idx for user in users_name]): + if all( + [ + _find_idx_by_name(user.name, node_list) <= chunk_outputs_idx + for user in users_name + ] + ): context += "; %s = None" % chunk_inputs_name context += "\n" @@ -1255,8 +1393,11 @@ def _find_chunk_input_and_output_nodes(nodes: List[Node]): # we treat that input node as the input of the checkpoint function for node in nodes: for input_node in node._input_nodes.keys(): - if input_node not in nodes and input_node not in input_nodes \ - and not _is_non_compute_node_except_placeholder(input_node): + if ( + input_node not in nodes + and input_node not in input_nodes + and not _is_non_compute_node_except_placeholder(input_node) + ): 
input_nodes.append(input_node) # if a node has a user node which is not in the node list @@ -1264,8 +1405,11 @@ def _find_chunk_input_and_output_nodes(nodes: List[Node]): # TODO it is unsafe to remove non compute node here for node in nodes: for output_node in node.users.keys(): - if output_node not in nodes and node not in output_nodes \ - and not _is_non_compute_node_except_placeholder(input_node): + if ( + output_node not in nodes + and node not in output_nodes + and not _is_non_compute_node_except_placeholder(input_node) + ): output_nodes.append(node) return input_nodes, output_nodes @@ -1288,7 +1432,15 @@ def _replace_name(context, name_from, name_to): return context -def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_value_func, meta_nodes, meta_graph): +def emit_code_with_chunk( + body, + ckpt_func, + nodes, + emit_node_func, + delete_unused_value_func, + meta_nodes, + meta_graph, +): """Emit code with nested activation checkpoint When we detect some of the node.activation_checkpoint is a List, we will use this function to emit the activation checkpoint codes. 
@@ -1304,14 +1456,14 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v # find the offload regions chunk_region_search = ChunkRegionSearch(meta_graph) chunk_search = chunk_region_search.search_region() - chunk_regions = [i['region'] for i in chunk_search] - chunk_dims = [i['dim'] for i in chunk_search] - chunk_infos = [i['chunk_info'] for i in chunk_search] - + chunk_regions = [i["region"] for i in chunk_search] + chunk_dims = [i["dim"] for i in chunk_search] + chunk_infos = [i["chunk_info"] for i in chunk_search] + chunk_starts = [item[0] for item in chunk_regions] chunk_ends = [item[1] for item in chunk_regions] - chunk_inputs = [[j['inputs'][0] for j in i] for i in chunk_infos] - chunk_outputs = [[j['outputs'][0] for j in i] for i in chunk_infos] + chunk_inputs = [[j["inputs"][0] for j in i] for i in chunk_infos] + chunk_outputs = [[j["outputs"][0] for j in i] for i in chunk_infos] within_chunk_region = False node_list = list(nodes) @@ -1322,14 +1474,18 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v # inputs, outputs = _find_input_and_output_nodes(offload_node_list) # chunk_inputs.append(inputs) # chunk_outputs.append(outputs) - - chunk_inputs_idx = [[_find_idx_by_name(j.name, node_list) for j in i] for i in chunk_inputs] - chunk_outputs_idx = [[_find_idx_by_name(j.name, node_list) for j in i] for i in chunk_outputs] + + chunk_inputs_idx = [ + [_find_idx_by_name(j.name, node_list) for j in i] for i in chunk_inputs + ] + chunk_outputs_idx = [ + [_find_idx_by_name(j.name, node_list) for j in i] for i in chunk_outputs + ] chunk_inputs_names = [] for i in chunk_inputs: for j in i: chunk_inputs_names.append(j.name) - + # this flag is to prevent repeated insert of save tensors # hooks definition in ckpt_func node_idx = 0 @@ -1340,16 +1496,24 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v if node_idx in chunk_starts: within_chunk_region = True region_idx = 
chunk_starts.index(node_idx) - + # add for loop chunk_input_meta = [meta_nodes[i] for i in chunk_inputs_idx[region_idx]] - body.append(_gen_loop_start(chunk_input_meta, node_list[chunk_ends[region_idx]], chunk_dims[region_idx])) + body.append( + _gen_loop_start( + chunk_input_meta, + node_list[chunk_ends[region_idx]], + chunk_dims[region_idx], + ) + ) if within_chunk_region: emit_node_func(node, body) # replace input var with chunk var - body[-1] = _replace_name(body[-1], chunk_inputs[region_idx][0].name, 'chunk_tensor') - body[-1] = ' ' + body[-1] + body[-1] = _replace_name( + body[-1], chunk_inputs[region_idx][0].name, "chunk_tensor" + ) + body[-1] = " " + body[-1] delete_unused_value_func(node, body, chunk_inputs_names) else: @@ -1358,7 +1522,11 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v delete_unused_value_func(node, body, chunk_inputs_names) if node_idx in chunk_ends: - body.append(_gen_loop_end(node, chunk_inputs[region_idx], node_list, chunk_dims[region_idx])) + body.append( + _gen_loop_end( + node, chunk_inputs[region_idx], node_list, chunk_dims[region_idx] + ) + ) within_chunk_region = False node_idx += 1 @@ -1372,14 +1540,16 @@ def __init__(self, meta_graph): self.meta_graph = meta_graph self.meta_node = list(meta_graph.graph.nodes) - def _gen_python_code(self, nodes, root_module: str, namespace: _Namespace) -> PythonCode: + def _gen_python_code( + self, nodes, root_module: str, namespace: _Namespace + ) -> PythonCode: free_vars: List[str] = [] body: List[str] = [] globals_: Dict[str, Any] = {} wrapped_fns: Dict[str, None] = {} # Wrap string in list to pass by reference - maybe_return_annotation: List[str] = [''] + maybe_return_annotation: List[str] = [""] def add_global(name_hint: str, obj: Any): """Add an obj to be tracked as a global. @@ -1389,7 +1559,9 @@ def add_global(name_hint: str, obj: Any): Returns: the global name that should be used to reference 'obj' in generated source. 
""" - if _is_from_torch(obj) and obj != torch.device: # to support registering torch.device + if ( + _is_from_torch(obj) and obj != torch.device + ): # to support registering torch.device # HACK: workaround for how torch custom ops are registered. We # can't import them like normal modules so they must retain their # fully qualified name. @@ -1405,7 +1577,9 @@ def add_global(name_hint: str, obj: Any): return global_name # set _custom_builtins here so that we needn't import colossalai in forward - _custom_builtins["colossalai"] = _CustomBuiltin("import colossalai", colossalai) + _custom_builtins["colossalai"] = _CustomBuiltin( + "import colossalai", colossalai + ) # Pre-fill the globals table with registered builtins. for name, (_, obj) in _custom_builtins.items(): @@ -1414,16 +1588,16 @@ def add_global(name_hint: str, obj: Any): def type_repr(o: Any): if o == (): # Empty tuple is used for empty tuple type annotation Tuple[()] - return '()' + return "()" typename = _type_repr(o) - if hasattr(o, '__origin__'): + if hasattr(o, "__origin__"): # This is a generic type, e.g. typing.List[torch.Tensor] origin_type = _origin_type_map.get(o.__origin__, o.__origin__) origin_typename = add_global(_type_repr(origin_type), origin_type) - if hasattr(o, '__args__'): + if hasattr(o, "__args__"): # Assign global names for each of the inner type variables. args = [type_repr(arg) for arg in o.__args__] @@ -1441,20 +1615,21 @@ def type_repr(o: Any): # Common case: this is a regular module name like 'foo.bar.baz' return add_global(typename, o) - def _format_args(args: Tuple[Argument, ...], kwargs: Dict[str, Argument]) -> str: - + def _format_args( + args: Tuple[Argument, ...], kwargs: Dict[str, Argument] + ) -> str: def _get_repr(arg): # Handle NamedTuples (if it has `_fields`) via add_global. 
- if isinstance(arg, tuple) and hasattr(arg, '_fields'): + if isinstance(arg, tuple) and hasattr(arg, "_fields"): qualified_name = _get_qualified_name(type(arg)) global_name = add_global(qualified_name, type(arg)) return f"{global_name}{repr(tuple(arg))}" return repr(arg) - args_s = ', '.join(_get_repr(a) for a in args) - kwargs_s = ', '.join(f'{k} = {_get_repr(v)}' for k, v in kwargs.items()) + args_s = ", ".join(_get_repr(a) for a in args) + kwargs_s = ", ".join(f"{k} = {_get_repr(v)}" for k, v in kwargs.items()) if args_s and kwargs_s: - return f'{args_s}, {kwargs_s}' + return f"{args_s}, {kwargs_s}" return args_s or kwargs_s # Run through reverse nodes and record the first instance of a use @@ -1472,9 +1647,9 @@ def register_last_uses(n: Node, user: Node): for node in reversed(nodes): map_arg(node.args, lambda n: register_last_uses(n, node)) map_arg(node.kwargs, lambda n: register_last_uses(n, node)) - + _delete_free_var_from_last_use(user_to_last_uses) - + # NOTE: we add a variable to distinguish body and ckpt_func def delete_unused_values(user: Node, body, to_keep=[]): """ @@ -1482,103 +1657,140 @@ def delete_unused_values(user: Node, body, to_keep=[]): not used in the remainder of the code are freed and the memory usage of the code is optimal. 
""" - if user.op == 'placeholder': + if user.op == "placeholder": return - if user.op == 'output': - body.append('\n') + if user.op == "output": + body.append("\n") return nodes_to_delete = user_to_last_uses.get(user, []) nodes_to_delete = [i for i in nodes_to_delete if i.name not in to_keep] if len(nodes_to_delete): - to_delete_str = ' = '.join([repr(n) for n in nodes_to_delete] + ['None']) - body.append(f'; {to_delete_str}\n') + to_delete_str = " = ".join( + [repr(n) for n in nodes_to_delete] + ["None"] + ) + body.append(f"; {to_delete_str}\n") else: - body.append('\n') + body.append("\n") # NOTE: we add a variable to distinguish body and ckpt_func def emit_node(node: Node, body): - maybe_type_annotation = '' if node.type is None else f' : {type_repr(node.type)}' - if node.op == 'placeholder': + maybe_type_annotation = ( + "" if node.type is None else f" : {type_repr(node.type)}" + ) + if node.op == "placeholder": assert isinstance(node.target, str) - maybe_default_arg = '' if not node.args else f' = {repr(node.args[0])}' - free_vars.append(f'{node.target}{maybe_type_annotation}{maybe_default_arg}') - raw_name = node.target.replace('*', '') + maybe_default_arg = ( + "" if not node.args else f" = {repr(node.args[0])}" + ) + free_vars.append( + f"{node.target}{maybe_type_annotation}{maybe_default_arg}" + ) + raw_name = node.target.replace("*", "") if raw_name != repr(node): - body.append(f'{repr(node)} = {raw_name}\n') + body.append(f"{repr(node)} = {raw_name}\n") return - elif node.op == 'call_method': + elif node.op == "call_method": assert isinstance(node.target, str) body.append( - f'{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.target)}' - f'({_format_args(node.args[1:], node.kwargs)})') + f"{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.target)}" + f"({_format_args(node.args[1:], node.kwargs)})" + ) return - elif node.op == 'call_function': + elif node.op == "call_function": assert 
callable(node.target) # pretty print operators - if node.target.__module__ == '_operator' and node.target.__name__ in magic_methods: + if ( + node.target.__module__ == "_operator" + and node.target.__name__ in magic_methods + ): assert isinstance(node.args, tuple) - body.append(f'{repr(node)}{maybe_type_annotation} = ' - f'{magic_methods[node.target.__name__].format(*(repr(a) for a in node.args))}') + body.append( + f"{repr(node)}{maybe_type_annotation} = " + f"{magic_methods[node.target.__name__].format(*(repr(a) for a in node.args))}" + ) return # pretty print inplace operators; required for jit.script to work properly # not currently supported in normal FX graphs, but generated by torchdynamo - if node.target.__module__ == '_operator' and node.target.__name__ in inplace_methods: - body.append(f'{inplace_methods[node.target.__name__].format(*(repr(a) for a in node.args))}; ' - f'{repr(node)}{maybe_type_annotation} = {repr(node.args[0])}') + if ( + node.target.__module__ == "_operator" + and node.target.__name__ in inplace_methods + ): + body.append( + f"{inplace_methods[node.target.__name__].format(*(repr(a) for a in node.args))}; " + f"{repr(node)}{maybe_type_annotation} = {repr(node.args[0])}" + ) return qualified_name = _get_qualified_name(node.target) global_name = add_global(qualified_name, node.target) # special case for getattr: node.args could be 2-argument or 3-argument # 2-argument: attribute access; 3-argument: fall through to attrib function call with default value - if global_name == 'getattr' and \ - isinstance(node.args, tuple) and \ - isinstance(node.args[1], str) and \ - node.args[1].isidentifier() and \ - len(node.args) == 2: + if ( + global_name == "getattr" + and isinstance(node.args, tuple) + and isinstance(node.args[1], str) + and node.args[1].isidentifier() + and len(node.args) == 2 + ): body.append( - f'{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.args[1])}') + f"{repr(node)}{maybe_type_annotation} = 
{_format_target(repr(node.args[0]), node.args[1])}" + ) return body.append( - f'{repr(node)}{maybe_type_annotation} = {global_name}({_format_args(node.args, node.kwargs)})') - if node.meta.get('is_wrapped', False): + f"{repr(node)}{maybe_type_annotation} = {global_name}({_format_args(node.args, node.kwargs)})" + ) + if node.meta.get("is_wrapped", False): wrapped_fns.setdefault(global_name) return - elif node.op == 'call_module': + elif node.op == "call_module": assert isinstance(node.target, str) - body.append(f'{repr(node)}{maybe_type_annotation} = ' - f'{_format_target(root_module, node.target)}({_format_args(node.args, node.kwargs)})') + body.append( + f"{repr(node)}{maybe_type_annotation} = " + f"{_format_target(root_module, node.target)}({_format_args(node.args, node.kwargs)})" + ) return - elif node.op == 'get_attr': + elif node.op == "get_attr": assert isinstance(node.target, str) - body.append(f'{repr(node)}{maybe_type_annotation} = {_format_target(root_module, node.target)}') + body.append( + f"{repr(node)}{maybe_type_annotation} = {_format_target(root_module, node.target)}" + ) return - elif node.op == 'output': + elif node.op == "output": if node.type is not None: maybe_return_annotation[0] = f" -> {type_repr(node.type)}" body.append(self.generate_output(node.args[0])) return - raise NotImplementedError(f'node: {node.op} {node.target}') + raise NotImplementedError(f"node: {node.op} {node.target}") # Modified for activation checkpointing ckpt_func = [] # if any node has a list of labels for activation_checkpoint, we # will use nested type of activation checkpoint codegen - emit_code_with_chunk(body, ckpt_func, nodes, emit_node, delete_unused_values, self.meta_node, self.meta_graph) + emit_code_with_chunk( + body, + ckpt_func, + nodes, + emit_node, + delete_unused_values, + self.meta_node, + self.meta_graph, + ) if len(body) == 0: # If the Graph has no non-placeholder nodes, no lines for the body # have been emitted. 
To continue to have valid Python code, emit a # single pass statement - body.append('pass\n') + body.append("pass\n") if len(wrapped_fns) > 0: - wrap_name = add_global('wrap', torch.fx.wrap) - wrap_stmts = '\n'.join([f'{wrap_name}("{name}")' for name in wrapped_fns]) + wrap_name = add_global("wrap", torch.fx.wrap) + wrap_stmts = "\n".join( + [f'{wrap_name}("{name}")' for name in wrapped_fns] + ) else: - wrap_stmts = '' + wrap_stmts = "" if self._body_transformer: body = self._body_transformer(body) @@ -1589,15 +1801,15 @@ def emit_node(node: Node, body): # as we need colossalai.utils.checkpoint, we need to import colossalai # in forward function prologue = self.gen_fn_def(free_vars, maybe_return_annotation[0]) - prologue = ''.join(ckpt_func) + prologue + prologue = "".join(ckpt_func) + prologue prologue = prologue - code = ''.join(body) - code = '\n'.join(' ' + line for line in code.split('\n')) + code = "".join(body) + code = "\n".join(" " + line for line in code.split("\n")) fn_code = f""" {wrap_stmts} {prologue} -{code}""" +{code}""" print(fn_code) return PythonCode(fn_code, globals_) From 5de9e46381f35a40ffff3675c2170a987b6fd9b9 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Sat, 10 Dec 2022 17:34:48 +0800 Subject: [PATCH 030/209] code format --- chunk_codegen.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index e8cf0d22f157..9147aa9fcc20 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -49,7 +49,9 @@ def _is_non_compute_node(node): def _is_non_compute_node_except_placeholder(node): - if (any(i in node.op for i in ["get_attr", "output"]) or any(i in node.name for i in ["getitem", "getattr"])): + if any(i in node.op for i in ["get_attr", "output"]) or any( + i in node.name for i in ["getitem", "getattr"] + ): return True return False From 31a2c5d09fb5496c90f740b3e7cac787ef489e91 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Mon, 12 Dec 2022 17:24:06 +0800 Subject: [PATCH 031/209] work with 
outerproductmean and msa --- chunk_codegen.py | 258 ++++++++++++++++++++++++++++++----------------- 1 file changed, 168 insertions(+), 90 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 9147aa9fcc20..191eab564853 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -134,7 +134,7 @@ def find_node_flow(self, node): return name, i raise RuntimeError("invalid node") - def get_flow_mix(self, node): + def _get_flow_mix_node(self, node): if self._is_non_compute_node(node): return None _, node_trace = self.find_node_flow(node) @@ -145,7 +145,7 @@ def get_flow_mix(self, node): vars = list(node_trace["outside_depend"][0].values())[0] return vars - def get_same_flow_node(self, node_list, node): + def _get_same_flow_node(self, node_list, node): name, _ = self.find_node_flow(node) result = [] for i in self.flow_trace[name]: @@ -181,13 +181,14 @@ def trace_flow(self): ) return self.flow_trace - def _detect_flow(self, start_idx, start_dim, end_idx, end_dim): - inputs, outputs = _find_chunk_input_and_output_nodes( + def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer): + inputs, outputs = _find_chunk_compute_input_and_output_nodes( self.node_list[start_idx : end_idx + 1] ) chunk_info = { "region": (start_idx, end_idx), "inputs": inputs, + "inputs_non_chunk": [], "inputs_dim": start_dim, "outputs": outputs, "outputs_dim": end_dim, @@ -197,31 +198,71 @@ def _detect_flow(self, start_idx, start_dim, end_idx, end_dim): for idx in range(start_idx, end_idx + 1): node = self.node_list[idx] - mix_flow_var = self.get_flow_mix(node) - if mix_flow_var is None: + mix_flow_node = self._get_flow_mix_node(node) + if mix_flow_node is None: continue - # if there is a flow mix, op must be in [mul, add, div, matmul] + # if there is a flow mix, op must be in [mul, add, matmul] # element-wise op requires dim to be equal in every dim if any(n in node.name for n in ["mul", "add"]): for i in node.args: - if type(i) == type(mix_flow_var) and i != mix_flow_var: 
+ if type(i) == type(mix_flow_node) and i != mix_flow_node: main_flow_var = i # if mix flow is a broadcast in chunk dim, # TODO need to move that flow out of the chunk - if mix_flow_var.meta["tensor_meta"].shape[dim_idx] == 1: + mix_flow_node_dim = index_tracer._get_node_chunk_dim( + self.node_list[end_idx], end_dim, node + ) + if mix_flow_node_dim is None: flow_flag = True - for i in self.get_same_flow_node( - chunk_info["inputs"], mix_flow_var + break + if _get_node_shape(mix_flow_node)[mix_flow_node_dim] == 1: + flow_flag = False + for i in self._get_same_flow_node( + chunk_info["inputs"], mix_flow_node ): chunk_info["inputs"].remove(i) # else, we need to chunk mix var as well else: # TODO chunk another value - flow_flag = False + flow_flag = True break else: raise NotImplementedError("%s not implemented" % node.name) + + inputs_dim = [] + remove_inputs = [] + for input_node in chunk_info['inputs']: + input_dict = {} + for user in input_node.users.keys(): + if _is_non_compute_node(user): + continue + user_idx = _find_idx_by_name(user.name, self.node_list) + dim = None + if start_dim <= user_idx < end_idx: + dim = index_tracer._get_node_chunk_dim( + self.node_list[end_idx], end_dim, input_node + ) + elif user_idx == end_idx: + dim = end_dim + # n has relation with chunk dim + if dim is not None and _get_node_shape(user)[dim] != 1: + input_dict[user_idx] = dim + if len(input_dict) == 0: + remove_inputs.append(input_node) + else: + inputs_dim.append(input_dict) + chunk_info['inputs_dim'] = inputs_dim + for i in remove_inputs: + if i in chunk_info['inputs']: + chunk_info['inputs'].remove(i) + + # we need to log input nodes to avoid deleteing them in the loop + non_chunk_inputs = _find_chunk_all_input_nodes(self.node_list[start_idx : end_idx + 1]) + for i in non_chunk_inputs: + if i not in chunk_info['inputs']: + chunk_info["inputs_non_chunk"].append(i) + return flow_flag, chunk_info @@ -367,6 +408,20 @@ def _find_trace_from_node(self, node): node_dict = 
self.idx_trace_list[node_idx] return node_dict + def _find_source_trace_from_node(self, node): + """ + Find node source trace by the node. + + Args: + node (node) + Returns: + idx (list): idx of the node + compute (list): computed idx of the node. + """ + node_idx = _find_idx_by_name(node.name, self.nodes_list) + node_dict = self.idx_trace_list[node_idx] + return node_dict["source"] + def _find_idx_trace_from_node(self, node): """ Find node idx trace by the node. @@ -836,6 +891,15 @@ def check_index_compute(self, start_idx, end_dim, end_node, end_idx): # return False # return True + def _get_node_chunk_dim(self, node_from, node_from_dim, node_to): + node_from_source = self._find_source_trace_from_node(node_from) + dim_source = node_from_source[node_from_dim] + node_to_idx = _find_idx_by_name(node_to.name, self.nodes_list) + for k, v in dim_source.items(): + if k == node_to_idx: + return v + return None + class MemoryEstimator(object): def __init__(self) -> None: @@ -931,8 +995,10 @@ def _get_contiguous_memory(self, node, not_contiguous_list, delete=False): return mem def _get_chunk_ratio(self, node, chunk_dim, chunk_size): + sorted_dim = sorted(chunk_dim, key=lambda x: list(x.keys())[0]) + dim = list(sorted_dim[-1].values())[0] shape = node.meta["tensor_meta"].shape - chunk_ratio = float(chunk_size) / shape[chunk_dim] + chunk_ratio = float(chunk_size) / shape[dim] return chunk_ratio def _get_chunk_delete_node_size( @@ -1157,6 +1223,8 @@ def _check_duplicate_map(self, chunk_infos): return chunk_infos def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx): + if start_idx == 71 and end_idx == 126: + print(1) start_traces = input_trace[start_idx] end_trace = output_trace[end_idx] end_node = self.node_list[end_idx] @@ -1188,7 +1256,7 @@ def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx): continue # detect flow meet flow_flag, chunk_info = self.flow_tracer._detect_flow( - start_idx, start_dim, end_idx, end_dim + start_idx, start_dim, 
end_idx, end_dim, self.index_tracer ) if flow_flag: continue @@ -1301,56 +1369,53 @@ def _get_first_non_single_dim(shape): raise RuntimeError("can not get first non single dim for shape", shape) -def _gen_loop_start(chunk_input_meta, chunk_output, chunk_dim, chunk_size=2): - if len(chunk_input_meta) == 1: - node = chunk_input_meta[0] - node_shape = node.meta["tensor_meta"].shape - free_shape = [ - node_shape[i] if i in chunk_dim else 1 for i in range(len(node_shape)) - ] - chunk_dim = _get_first_non_single_dim(free_shape) - chunk_slice = _gen_chunk_slice_dim(chunk_dim, "gen_chunk_idx", node_shape) - out_shape = str(list(chunk_output.meta["tensor_meta"].shape)) - - context = ( - "chunk_result = torch.empty(%s, dtype=%s.dtype, device=%s.device); chunk_size = %d\nfor gen_chunk_idx in range" - % (out_shape, node.name, node.name, chunk_size) - ) - context += "(0, %s.shape[%d], chunk_size):\n" % (node.name, chunk_dim) - context += " chunk_tensor = %s%s\n" % (node.name, chunk_slice) - else: - raise NotImplementedError( - "input with size %d not implemented" % len(chunk_input_meta) - ) +def _gen_loop_start(chunk_input, chunk_output, chunk_ouput_dim, chunk_size=2): + input_node = chunk_input[0] + + out_shape = _get_node_shape(chunk_output) + out_str = str(list(out_shape)) + + context = ( + "chunk_result = torch.empty(%s, dtype=%s.dtype, device=%s.device); chunk_size = %d\nfor chunk_idx in range" + % (out_str, input_node.name, input_node.name, chunk_size) + ) + context += "(0, %d, chunk_size):\n" % (out_shape[chunk_ouput_dim]) + + # node = chunk_input[0] + # node_shape = node.meta["tensor_meta"].shape + # free_shape = [ + # node_shape[i] if i in chunk_dim else 1 for i in range(len(node_shape)) + # ] + # chunk_dim = _get_first_non_single_dim(free_shape) + # chunk_slice = _gen_chunk_slice_dim(chunk_dim, "gen_chunk_idx", node_shape) + # out_shape = str(list(chunk_output.meta["tensor_meta"].shape)) + + # context = ( + # "chunk_result = torch.empty(%s, dtype=%s.dtype, 
device=%s.device); chunk_size = %d\nfor gen_chunk_idx in range" + # % (out_shape, node.name, node.name, chunk_size) + # ) + # context += "(0, %s.shape[%d], chunk_size):\n" % (node.name, chunk_dim) + # context += " chunk_tensor = %s%s\n" % (node.name, chunk_slice) return context -def _gen_loop_end(chunk_outputs, chunk_inputs, node_list, chunk_dim): - chunk_inputs_name = chunk_inputs[0].name +def _gen_loop_end(chunk_inputs, chunk_non_compute_inputs, chunk_outputs, chunk_outputs_dim, node_list): chunk_outputs_name = chunk_outputs.name chunk_outputs_idx = _find_idx_by_name(chunk_outputs_name, node_list) chunk_output_shape = chunk_outputs.meta["tensor_meta"].shape - free_shape = [ - chunk_output_shape[i] if i in chunk_dim else 1 - for i in range(len(chunk_output_shape)) - ] - chunk_dim = _get_first_non_single_dim(free_shape) - chunk_slice = _gen_chunk_slice_dim(chunk_dim, "gen_chunk_idx", chunk_output_shape) + chunk_slice = _gen_chunk_slice_dim(chunk_outputs_dim, "chunk_idx", chunk_output_shape) context = " chunk_result%s = %s\n" % (chunk_slice, chunk_outputs_name) - - context += ( - chunk_outputs_name + " = chunk_result; chunk_result = None; chunk_size = None" - ) + context += (chunk_outputs_name + " = chunk_result; chunk_result = None; chunk_size = None") # determine if its the last use for chunk input - users_name = list(chunk_inputs[0].users.keys()) - if all( - [ - _find_idx_by_name(user.name, node_list) <= chunk_outputs_idx - for user in users_name - ] - ): - context += "; %s = None" % chunk_inputs_name + for chunk_input in (chunk_inputs + chunk_non_compute_inputs): + if all( + [ + _find_idx_by_name(user.name, node_list) <= chunk_outputs_idx + for user in chunk_input.users.keys() + ] + ): + context += "; %s = None" % chunk_input.name context += "\n" return context @@ -1382,7 +1447,24 @@ def _find_input_and_output_nodes(nodes: List[Node]): return input_nodes, output_nodes -def _find_chunk_input_and_output_nodes(nodes: List[Node]): +def 
_find_chunk_all_input_nodes(nodes: List[Node]): + """ + Find non-compute input and output node names. + input nodes are nodes used in the list + output nodes are nodes will use nodes in the list + """ + input_nodes = [] + for node in nodes: + for input_node in node._input_nodes.keys(): + if ( + input_node not in nodes + and input_node not in input_nodes + ): + input_nodes.append(input_node) + return input_nodes + + +def _find_chunk_compute_input_and_output_nodes(nodes: List[Node]): """ Find non-compute input and output node names. input nodes are nodes used in the list @@ -1410,7 +1492,7 @@ def _find_chunk_input_and_output_nodes(nodes: List[Node]): if ( output_node not in nodes and node not in output_nodes - and not _is_non_compute_node_except_placeholder(input_node) + and not _is_non_compute_node_except_placeholder(output_node) ): output_nodes.append(node) @@ -1454,44 +1536,34 @@ def emit_code_with_chunk( emit_node_func: function to emit node delete_unused_value_func: function to remove the unused value """ + node_list = list(nodes) - # find the offload regions + # find the chunk regions chunk_region_search = ChunkRegionSearch(meta_graph) chunk_search = chunk_region_search.search_region() - chunk_regions = [i["region"] for i in chunk_search] - chunk_dims = [i["dim"] for i in chunk_search] - chunk_infos = [i["chunk_info"] for i in chunk_search] - - chunk_starts = [item[0] for item in chunk_regions] - chunk_ends = [item[1] for item in chunk_regions] - chunk_inputs = [[j["inputs"][0] for j in i] for i in chunk_infos] - chunk_outputs = [[j["outputs"][0] for j in i] for i in chunk_infos] - within_chunk_region = False - - node_list = list(nodes) - # find the input and output var names for each offload region - # for idx, (start, end) in enumerate(chunk_regions): - # offload_node_list = node_list[start:end + 1] - # inputs, outputs = _find_input_and_output_nodes(offload_node_list) - # chunk_inputs.append(inputs) - # chunk_outputs.append(outputs) + chunk_regions = 
[i["region"] for i in chunk_search] + chunk_starts = [i[0] for i in chunk_regions] + chunk_ends = [i[1] for i in chunk_regions] + chunk_inputs = [i["inputs"] for i in chunk_search] + chunk_inputs_non_chunk = [i["inputs_non_chunk"] for i in chunk_search] + chunk_inputs_dim = [i["inputs_dim"] for i in chunk_search] chunk_inputs_idx = [ [_find_idx_by_name(j.name, node_list) for j in i] for i in chunk_inputs ] + chunk_inputs_names = [j.name for i in chunk_inputs for j in i] + [j.name for i in chunk_inputs_non_chunk for j in i] + + chunk_outputs = [i["outputs"][0] for i in chunk_search] + chunk_outputs_dim = [i["outputs_dim"] for i in chunk_search] chunk_outputs_idx = [ - [_find_idx_by_name(j.name, node_list) for j in i] for i in chunk_outputs + _find_idx_by_name(i.name, node_list) for i in chunk_outputs ] - chunk_inputs_names = [] - for i in chunk_inputs: - for j in i: - chunk_inputs_names.append(j.name) - # this flag is to prevent repeated insert of save tensors - # hooks definition in ckpt_func node_idx = 0 region_idx = 0 + within_chunk_region = False + while node_idx < len(node_list): node = node_list[node_idx] @@ -1500,21 +1572,24 @@ def emit_code_with_chunk( region_idx = chunk_starts.index(node_idx) # add for loop - chunk_input_meta = [meta_nodes[i] for i in chunk_inputs_idx[region_idx]] body.append( _gen_loop_start( - chunk_input_meta, - node_list[chunk_ends[region_idx]], - chunk_dims[region_idx], + chunk_inputs[region_idx], + chunk_outputs[region_idx], + chunk_outputs_dim[region_idx], ) ) if within_chunk_region: emit_node_func(node, body) # replace input var with chunk var - body[-1] = _replace_name( - body[-1], chunk_inputs[region_idx][0].name, "chunk_tensor" - ) + for input_node_idx, input_node in enumerate(chunk_inputs[region_idx]): + for idx, dim in chunk_inputs_dim[region_idx][input_node_idx].items(): + if idx == node_idx: + chunk_slice = _gen_chunk_slice_dim(dim, "chunk_idx", _get_node_shape(input_node)) + body[-1] = _replace_name( + body[-1], 
input_node.name, input_node.name + chunk_slice + ) body[-1] = " " + body[-1] delete_unused_value_func(node, body, chunk_inputs_names) @@ -1526,7 +1601,10 @@ def emit_code_with_chunk( if node_idx in chunk_ends: body.append( _gen_loop_end( - node, chunk_inputs[region_idx], node_list, chunk_dims[region_idx] + chunk_inputs[region_idx], + chunk_inputs_non_chunk[region_idx], + chunk_outputs[region_idx], + chunk_outputs_dim[region_idx], node_list ) ) within_chunk_region = False From b7b67c32ad79c4e81775b32fc4a36ec733915f56 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Mon, 12 Dec 2022 17:25:38 +0800 Subject: [PATCH 032/209] code style --- chunk_codegen.py | 70 +++++++++++++++++++----------------------------- 1 file changed, 28 insertions(+), 42 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 191eab564853..3bea84faeabb 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -229,10 +229,10 @@ def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer): break else: raise NotImplementedError("%s not implemented" % node.name) - + inputs_dim = [] remove_inputs = [] - for input_node in chunk_info['inputs']: + for input_node in chunk_info["inputs"]: input_dict = {} for user in input_node.users.keys(): if _is_non_compute_node(user): @@ -252,15 +252,17 @@ def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer): remove_inputs.append(input_node) else: inputs_dim.append(input_dict) - chunk_info['inputs_dim'] = inputs_dim + chunk_info["inputs_dim"] = inputs_dim for i in remove_inputs: - if i in chunk_info['inputs']: - chunk_info['inputs'].remove(i) - + if i in chunk_info["inputs"]: + chunk_info["inputs"].remove(i) + # we need to log input nodes to avoid deleteing them in the loop - non_chunk_inputs = _find_chunk_all_input_nodes(self.node_list[start_idx : end_idx + 1]) + non_chunk_inputs = _find_chunk_all_input_nodes( + self.node_list[start_idx : end_idx + 1] + ) for i in non_chunk_inputs: - if i not in chunk_info['inputs']: + if 
i not in chunk_info["inputs"]: chunk_info["inputs_non_chunk"].append(i) return flow_flag, chunk_info @@ -1371,44 +1373,32 @@ def _get_first_non_single_dim(shape): def _gen_loop_start(chunk_input, chunk_output, chunk_ouput_dim, chunk_size=2): input_node = chunk_input[0] - out_shape = _get_node_shape(chunk_output) out_str = str(list(out_shape)) - context = ( "chunk_result = torch.empty(%s, dtype=%s.dtype, device=%s.device); chunk_size = %d\nfor chunk_idx in range" % (out_str, input_node.name, input_node.name, chunk_size) ) context += "(0, %d, chunk_size):\n" % (out_shape[chunk_ouput_dim]) - - # node = chunk_input[0] - # node_shape = node.meta["tensor_meta"].shape - # free_shape = [ - # node_shape[i] if i in chunk_dim else 1 for i in range(len(node_shape)) - # ] - # chunk_dim = _get_first_non_single_dim(free_shape) - # chunk_slice = _gen_chunk_slice_dim(chunk_dim, "gen_chunk_idx", node_shape) - # out_shape = str(list(chunk_output.meta["tensor_meta"].shape)) - - # context = ( - # "chunk_result = torch.empty(%s, dtype=%s.dtype, device=%s.device); chunk_size = %d\nfor gen_chunk_idx in range" - # % (out_shape, node.name, node.name, chunk_size) - # ) - # context += "(0, %s.shape[%d], chunk_size):\n" % (node.name, chunk_dim) - # context += " chunk_tensor = %s%s\n" % (node.name, chunk_slice) return context -def _gen_loop_end(chunk_inputs, chunk_non_compute_inputs, chunk_outputs, chunk_outputs_dim, node_list): +def _gen_loop_end( + chunk_inputs, chunk_non_compute_inputs, chunk_outputs, chunk_outputs_dim, node_list +): chunk_outputs_name = chunk_outputs.name chunk_outputs_idx = _find_idx_by_name(chunk_outputs_name, node_list) chunk_output_shape = chunk_outputs.meta["tensor_meta"].shape - chunk_slice = _gen_chunk_slice_dim(chunk_outputs_dim, "chunk_idx", chunk_output_shape) + chunk_slice = _gen_chunk_slice_dim( + chunk_outputs_dim, "chunk_idx", chunk_output_shape + ) context = " chunk_result%s = %s\n" % (chunk_slice, chunk_outputs_name) - context += (chunk_outputs_name + " = 
chunk_result; chunk_result = None; chunk_size = None") + context += ( + chunk_outputs_name + " = chunk_result; chunk_result = None; chunk_size = None" + ) # determine if its the last use for chunk input - for chunk_input in (chunk_inputs + chunk_non_compute_inputs): + for chunk_input in chunk_inputs + chunk_non_compute_inputs: if all( [ _find_idx_by_name(user.name, node_list) <= chunk_outputs_idx @@ -1456,10 +1446,7 @@ def _find_chunk_all_input_nodes(nodes: List[Node]): input_nodes = [] for node in nodes: for input_node in node._input_nodes.keys(): - if ( - input_node not in nodes - and input_node not in input_nodes - ): + if input_node not in nodes and input_node not in input_nodes: input_nodes.append(input_node) return input_nodes @@ -1549,16 +1536,12 @@ def emit_code_with_chunk( chunk_inputs = [i["inputs"] for i in chunk_search] chunk_inputs_non_chunk = [i["inputs_non_chunk"] for i in chunk_search] chunk_inputs_dim = [i["inputs_dim"] for i in chunk_search] - chunk_inputs_idx = [ - [_find_idx_by_name(j.name, node_list) for j in i] for i in chunk_inputs + chunk_inputs_names = [j.name for i in chunk_inputs for j in i] + [ + j.name for i in chunk_inputs_non_chunk for j in i ] - chunk_inputs_names = [j.name for i in chunk_inputs for j in i] + [j.name for i in chunk_inputs_non_chunk for j in i] chunk_outputs = [i["outputs"][0] for i in chunk_search] chunk_outputs_dim = [i["outputs_dim"] for i in chunk_search] - chunk_outputs_idx = [ - _find_idx_by_name(i.name, node_list) for i in chunk_outputs - ] node_idx = 0 region_idx = 0 @@ -1586,7 +1569,9 @@ def emit_code_with_chunk( for input_node_idx, input_node in enumerate(chunk_inputs[region_idx]): for idx, dim in chunk_inputs_dim[region_idx][input_node_idx].items(): if idx == node_idx: - chunk_slice = _gen_chunk_slice_dim(dim, "chunk_idx", _get_node_shape(input_node)) + chunk_slice = _gen_chunk_slice_dim( + dim, "chunk_idx", _get_node_shape(input_node) + ) body[-1] = _replace_name( body[-1], input_node.name, input_node.name 
+ chunk_slice ) @@ -1604,7 +1589,8 @@ def emit_code_with_chunk( chunk_inputs[region_idx], chunk_inputs_non_chunk[region_idx], chunk_outputs[region_idx], - chunk_outputs_dim[region_idx], node_list + chunk_outputs_dim[region_idx], + node_list, ) ) within_chunk_region = False From 5cdfcfe1d168e39d39a741112c036fa1455f0d06 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Mon, 12 Dec 2022 17:29:07 +0800 Subject: [PATCH 033/209] code style --- chunk_codegen.py | 49 ++++-------------------------------------------- 1 file changed, 4 insertions(+), 45 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 3bea84faeabb..96dcbfc0f79d 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -92,24 +92,10 @@ def _init_trace(self): self._add_trace(i.name) self._add_node(i.name, i) - def _is_non_compute_node(self, node): - if any(i in node.op for i in ["placeholder", "get_attr", "output"]) or any( - i in node.name for i in ["getitem", "getattr"] - ): - return True - return False - - def _is_non_compute_node_except_placeholder(self, node): - if any(i in node.op for i in ["get_attr", "output"]) or any( - i in node.name for i in ["getitem", "getattr"] - ): - return True - return False - def _find_flow_for_node(self, node): if type(self.node_list[0]) != type(node): return None - if self._is_non_compute_node_except_placeholder(node): + if _is_non_compute_node_except_placeholder(node): return None for name, trace in self.flow_trace.items(): for i in trace: @@ -135,7 +121,7 @@ def find_node_flow(self, node): raise RuntimeError("invalid node") def _get_flow_mix_node(self, node): - if self._is_non_compute_node(node): + if _is_non_compute_node(node): return None _, node_trace = self.find_node_flow(node) if len(node_trace["outside_depend"]) == 0: @@ -160,10 +146,9 @@ def trace_flow(self): for node in self.node_list: # skip if non compute node if all( - type(arg) != type(node) - or self._is_non_compute_node_except_placeholder(arg) + type(arg) != type(node) or 
_is_non_compute_node_except_placeholder(arg) for arg in node.args - ) or self._is_non_compute_node(node): + ) or _is_non_compute_node(node): continue node_input_flows = [self._find_flow_for_node(arg) for arg in node.args] @@ -1411,32 +1396,6 @@ def _gen_loop_end( return context -def _find_input_and_output_nodes(nodes: List[Node]): - """ - Find the input and output node names which are not found in the given list of nodes. - """ - input_nodes = [] - output_nodes = [] - - # if a node has an input node which is not in the node list - # we treat that input node as the input of the checkpoint function - for node in nodes: - for input_node in node._input_nodes.keys(): - node_repr = repr(input_node) - if input_node not in nodes and input_node not in input_nodes: - input_nodes.append(input_node) - - # if a node has a user node which is not in the node list - # we treat that user node as the node receiving the current node output - for node in nodes: - for output_node in node.users.keys(): - node_repr = repr(node) - if output_node not in nodes and output_node not in output_nodes: - output_nodes.append(output_node) - - return input_nodes, output_nodes - - def _find_chunk_all_input_nodes(nodes: List[Node]): """ Find non-compute input and output node names. 
From 8511d900a88638cb04ced2db35b171a96f6f310c Mon Sep 17 00:00:00 2001 From: oahzxl Date: Mon, 12 Dec 2022 17:36:17 +0800 Subject: [PATCH 034/209] code style --- chunk_codegen.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 96dcbfc0f79d..88d9178091b7 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -1210,8 +1210,6 @@ def _check_duplicate_map(self, chunk_infos): return chunk_infos def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx): - if start_idx == 71 and end_idx == 126: - print(1) start_traces = input_trace[start_idx] end_trace = output_trace[end_idx] end_node = self.node_list[end_idx] @@ -1347,15 +1345,6 @@ def _gen_chunk_slice_dim(chunk_dim, chunk_idx_name, shape): return new_shape -def _get_first_non_single_dim(shape): - for idx, i in enumerate(shape): - if i == 1: - continue - else: - return idx - raise RuntimeError("can not get first non single dim for shape", shape) - - def _gen_loop_start(chunk_input, chunk_output, chunk_ouput_dim, chunk_size=2): input_node = chunk_input[0] out_shape = _get_node_shape(chunk_output) From 98f9728e29f463692cea1533c998f0e7f2381e59 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Mon, 12 Dec 2022 18:15:47 +0800 Subject: [PATCH 035/209] code style --- chunk_codegen.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 88d9178091b7..22d48f5d661a 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -194,7 +194,7 @@ def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer): if type(i) == type(mix_flow_node) and i != mix_flow_node: main_flow_var = i # if mix flow is a broadcast in chunk dim, - # TODO need to move that flow out of the chunk + # TODO: need to move that flow out of the chunk mix_flow_node_dim = index_tracer._get_node_chunk_dim( self.node_list[end_idx], end_dim, node ) @@ -1200,7 +1200,7 @@ def _check_duplicate_map(self, chunk_infos): continue # it means 
an index create 2 copy of itself # eg. a = torch.matmul(x, x.transpose(-1, -2)) - # TODO currently remove it, deal with this in future + # TODO: currently remove it, deal with this in future if input_dim1 == input_dim2 and output_dim1 != output_dim2: remove_list.append(chunk_infos[idx1]) remove_list.append(chunk_infos[idx2]) @@ -1216,7 +1216,7 @@ def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx): chunk_infos = [] for end_dim, end_trace_idx in enumerate(end_trace["idx"]): if len(start_traces) > 1: - # TODO implement multi input chunk + # TODO: implement multi input chunk continue for start_node, start_trace in start_traces.items(): for start_dim, start_trace_idx in enumerate(start_trace["idx"]): @@ -1421,7 +1421,7 @@ def _find_chunk_compute_input_and_output_nodes(nodes: List[Node]): # if a node has a user node which is not in the node list # we treat that user node as the node receiving the current node output - # TODO it is unsafe to remove non compute node here + # TODO: it is unsafe to remove non compute node here for node in nodes: for output_node in node.users.keys(): if ( From 8754fa255376055c01aab4a3fab385454b8b7930 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Mon, 12 Dec 2022 18:25:47 +0800 Subject: [PATCH 036/209] change threshold --- chunk_codegen_run.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/chunk_codegen_run.py b/chunk_codegen_run.py index 88c734903392..99700e1af9d8 100644 --- a/chunk_codegen_run.py +++ b/chunk_codegen_run.py @@ -45,8 +45,9 @@ def _test_fwd_and_bwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair): with torch.no_grad(): non_fx_out = model(node, pair) fx_out = gm(node, pair) - assert torch.allclose(non_fx_out[0], fx_out[0], atol=1e-6), "fx_out doesn't comply with original output" - assert torch.allclose(non_fx_out[1], fx_out[1], atol=1e-6), "fx_out doesn't comply with original output" + + assert torch.allclose(non_fx_out[0], fx_out[0], atol=1e-4), "fx_out doesn't comply with 
original output" + assert torch.allclose(non_fx_out[1], fx_out[1], atol=1e-4), "fx_out doesn't comply with original output" # test barckward # loss0 = non_fx_out[0].sum() + non_fx_out[1].sum() From 1e0fd11bc1773ca47cbd95fb19b86517265390ce Mon Sep 17 00:00:00 2001 From: oahzxl Date: Tue, 13 Dec 2022 10:01:30 +0800 Subject: [PATCH 037/209] support check_index_duplicate --- chunk_codegen.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 22d48f5d661a..64bff4a801a1 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -179,7 +179,12 @@ def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer): "outputs_dim": end_dim, "args": {}, } - flow_flag = False + flow_block = False + + # TODO don't allow multi outputs now + if len(outputs) > 1: + flow_block = True + return flow_block, chunk_info for idx in range(start_idx, end_idx + 1): node = self.node_list[idx] @@ -199,10 +204,10 @@ def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer): self.node_list[end_idx], end_dim, node ) if mix_flow_node_dim is None: - flow_flag = True + flow_block = True break if _get_node_shape(mix_flow_node)[mix_flow_node_dim] == 1: - flow_flag = False + flow_block = False for i in self._get_same_flow_node( chunk_info["inputs"], mix_flow_node ): @@ -210,11 +215,15 @@ def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer): # else, we need to chunk mix var as well else: # TODO chunk another value - flow_flag = True + flow_block = True break else: raise NotImplementedError("%s not implemented" % node.name) + if flow_block: + flow_block = True + return flow_block, chunk_info + inputs_dim = [] remove_inputs = [] for input_node in chunk_info["inputs"]: @@ -250,7 +259,7 @@ def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer): if i not in chunk_info["inputs"]: chunk_info["inputs_non_chunk"].append(i) - return flow_flag, 
chunk_info + return flow_block, chunk_info class IndexTracer(object): @@ -869,14 +878,6 @@ def check_index_compute(self, start_idx, end_dim, end_node, end_idx): if any(start_idx <= i <= end_idx for i in end_node_compute): return False return True - # end_node_trace_source = end_node_trace['source'][end_dim] - # for node_idx, node_dim in end_node_trace_source.items(): - # if node_idx < start_node_idx or node_idx > end_node_idx: - # continue - # compute_list = self.idx_trace_list[node_idx]['compute'][node_dim] - # if any(start_node_idx <= i <= end_node_idx for i in compute_list): - # return False - # return True def _get_node_chunk_dim(self, node_from, node_from_dim, node_to): node_from_source = self._find_source_trace_from_node(node_from) @@ -1240,10 +1241,10 @@ def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx): ): continue # detect flow meet - flow_flag, chunk_info = self.flow_tracer._detect_flow( + flow_block, chunk_info = self.flow_tracer._detect_flow( start_idx, start_dim, end_idx, end_dim, self.index_tracer ) - if flow_flag: + if flow_block: continue chunk_infos.append(chunk_info) chunk_infos = self._check_duplicate_map(chunk_infos) From cda3e8572a8ab1f0c48342ad305fadbf892d62b2 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Tue, 13 Dec 2022 10:02:26 +0800 Subject: [PATCH 038/209] support index dupilictae and update loop --- chunk_codegen.py | 109 +++++++++++++++++++++++++++++-------------- chunk_codegen_run.py | 4 +- 2 files changed, 76 insertions(+), 37 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 64bff4a801a1..b5bb8f18560a 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -180,7 +180,7 @@ def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer): "args": {}, } flow_block = False - + # TODO don't allow multi outputs now if len(outputs) > 1: flow_block = True @@ -200,7 +200,7 @@ def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer): main_flow_var = i # if mix flow is a 
broadcast in chunk dim, # TODO: need to move that flow out of the chunk - mix_flow_node_dim = index_tracer._get_node_chunk_dim( + mix_flow_node_dim = index_tracer.get_node_chunk_dim( self.node_list[end_idx], end_dim, node ) if mix_flow_node_dim is None: @@ -223,7 +223,7 @@ def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer): if flow_block: flow_block = True return flow_block, chunk_info - + inputs_dim = [] remove_inputs = [] for input_node in chunk_info["inputs"]: @@ -234,7 +234,7 @@ def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer): user_idx = _find_idx_by_name(user.name, self.node_list) dim = None if start_dim <= user_idx < end_idx: - dim = index_tracer._get_node_chunk_dim( + dim = index_tracer.get_node_chunk_dim( self.node_list[end_idx], end_dim, input_node ) elif user_idx == end_idx: @@ -300,10 +300,10 @@ def _del_dim(self, idx, dim_idx): self.idx_trace_list[idx]["compute"].pop(dim_idx) self.idx_trace_list[idx]["source"].pop(dim_idx) - def _add_dim(self, idx, dim_idx): - self.idx_trace_list[idx]["idx"].insert(dim_idx, self._add_index()) - self.idx_trace_list[idx]["compute"].insert(dim_idx, []) - self.idx_trace_list[idx]["source"].insert(dim_idx, {}) + def _add_dim(self, node_idx, dim_idx): + self.idx_trace_list[node_idx]["idx"].insert(dim_idx, self._add_index()) + self.idx_trace_list[node_idx]["compute"].insert(dim_idx, []) + self.idx_trace_list[node_idx]["source"].insert(dim_idx, {}) def _transform_index(self, node, node_dim): node_idx = self._find_idx_trace_from_node(node) @@ -659,9 +659,7 @@ def _assign_unsqueeze_index(self, node, node_idx): """ self._del_dim(node_idx, -1) self._assign_index_as_input(node, node_idx) - self.idx_trace_list[node_idx]["idx"].insert(node.args[1], self._add_index()) - self.idx_trace_list[node_idx]["compute"].insert(node.args[1], []) - self.idx_trace_list[node_idx]["source"].insert(node.args[1], []) + self._add_dim(node_idx, node.args[1]) def _assign_dropout_index(self, node, 
node_idx): """ @@ -879,7 +877,7 @@ def check_index_compute(self, start_idx, end_dim, end_node, end_idx): return False return True - def _get_node_chunk_dim(self, node_from, node_from_dim, node_to): + def get_node_chunk_dim(self, node_from, node_from_dim, node_to): node_from_source = self._find_source_trace_from_node(node_from) dim_source = node_from_source[node_from_dim] node_to_idx = _find_idx_by_name(node_to.name, self.nodes_list) @@ -888,6 +886,44 @@ def _get_node_chunk_dim(self, node_from, node_from_dim, node_to): return v return None + def _find_inherit_dim(self, input_node, input_dim, node): + input_node_idx = _find_idx_by_name(input_node.name, self.nodes_list) + node_idx = _find_idx_by_name(node.name, self.nodes_list) + node_trace_source = self._find_source_trace_from_node(node) + for node_dim in range(len(_get_node_shape(node))): + if ( + input_node_idx in node_trace_source[node_dim] + and node_trace_source[node_dim][input_node_idx] == input_dim + ): + return {node_idx: node_dim} + return {} + + def check_index_duplicate(self, chunk_infos): + input_dim_after_node = {} + for input_node_idx, input_node in enumerate(chunk_infos["inputs"]): + for k, v in chunk_infos["inputs_dim"][input_node_idx].items(): + input_dim_after_node.update( + self._find_inherit_dim(input_node, v, self.nodes_list[k]) + ) + + for node in self.nodes_list[ + chunk_infos["region"][0] : chunk_infos["region"][1] + 1 + ]: + if _is_non_compute_node_except_placeholder(node): + continue + count = 0 + node_trace_source = self._find_source_trace_from_node(node) + for node_dim in range(len(_get_node_shape(node))): + dim_source = node_trace_source[node_dim] + for k, v in dim_source.items(): + if chunk_infos["region"][0] <= k <= chunk_infos["region"][1]: + if k in input_dim_after_node and input_dim_after_node[k] == v: + count += 1 + break + if count > 1: + return False + return True + class MemoryEstimator(object): def __init__(self) -> None: @@ -1160,7 +1196,7 @@ def _get_min_free_var(self, 
active_node_list, free_vars): min_len = len(n) return min_len - def _search_max_chunk_region(self, active_node, peak_node): + def _search_max_chunk_region(self, active_node, peak_node, chunk_regions): free_vars = self._get_free_var() min_var = self._get_min_free_var(active_node, free_vars) @@ -1180,6 +1216,21 @@ def _search_max_chunk_region(self, active_node, peak_node): break if i in free_vars or i == 0: raise RuntimeError() + + for i in chunk_regions: + region = i["region"] + if chunk_region_start >= region[0] and chunk_region_end <= region[1]: + return None + elif ( + region[0] <= chunk_region_start <= region[1] + and chunk_region_end > region[1] + ): + chunk_region_start = region[1] + 1 + elif ( + region[0] <= chunk_region_end <= region[1] + and chunk_region_start < region[0] + ): + chunk_region_end = region[0] - 1 return chunk_region_start, chunk_region_end def _is_not_compute(self, trace, chunk_range, dim_idx): @@ -1192,24 +1243,6 @@ def _is_not_compute(self, trace, chunk_range, dim_idx): return True return False - def _check_duplicate_map(self, chunk_infos): - dim_map = [(i["inputs_dim"], i["outputs_dim"]) for i in chunk_infos] - remove_list = [] - for idx1, (input_dim1, output_dim1) in enumerate(dim_map): - for idx2, (input_dim2, output_dim2) in enumerate(dim_map): - if idx1 == idx2: - continue - # it means an index create 2 copy of itself - # eg. 
a = torch.matmul(x, x.transpose(-1, -2)) - # TODO: currently remove it, deal with this in future - if input_dim1 == input_dim2 and output_dim1 != output_dim2: - remove_list.append(chunk_infos[idx1]) - remove_list.append(chunk_infos[idx2]) - for i in remove_list: - if i in chunk_infos: - chunk_infos.remove(i) - return chunk_infos - def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx): start_traces = input_trace[start_idx] end_trace = output_trace[end_idx] @@ -1246,8 +1279,10 @@ def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx): ) if flow_block: continue + # check index copmute + if not self.index_tracer.check_index_duplicate(chunk_info): + continue chunk_infos.append(chunk_info) - chunk_infos = self._check_duplicate_map(chunk_infos) return chunk_infos def _search_possible_chunk_regions(self, max_chunk_region, peak_node): @@ -1288,9 +1323,13 @@ def _search_best_chunk_region(self, possible_chunk_regions): max_region_range = i["region"][1] - i["region"][0] return best_regions - def _step_search(self, mem_peak, active_node): + def _step_search(self, mem_peak, active_node, chunk_regions): peak_node = self._find_peak_node(mem_peak) - max_chunk_region = self._search_max_chunk_region(active_node, peak_node) + max_chunk_region = self._search_max_chunk_region( + active_node, peak_node, chunk_regions + ) + if max_chunk_region == None: + return None possible_chunk_regions = self._search_possible_chunk_regions( max_chunk_region, peak_node ) @@ -1313,7 +1352,7 @@ def search_region(self): mem_peak = init_mem_peak while True: - chunk_region = self._step_search(mem_peak, active_node) + chunk_region = self._step_search(mem_peak, active_node, chunk_regions) if chunk_region is None: break diff --git a/chunk_codegen_run.py b/chunk_codegen_run.py index 99700e1af9d8..ae4653d6545b 100644 --- a/chunk_codegen_run.py +++ b/chunk_codegen_run.py @@ -46,8 +46,8 @@ def _test_fwd_and_bwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair): non_fx_out = 
model(node, pair) fx_out = gm(node, pair) - assert torch.allclose(non_fx_out[0], fx_out[0], atol=1e-4), "fx_out doesn't comply with original output" - assert torch.allclose(non_fx_out[1], fx_out[1], atol=1e-4), "fx_out doesn't comply with original output" + assert torch.allclose(non_fx_out[0], fx_out[0], atol=1e-4), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean(torch.abs(non_fx_out[0] - fx_out[0])) + assert torch.allclose(non_fx_out[1], fx_out[1], atol=1e-4), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean(torch.abs(non_fx_out[1] - fx_out[1])) # test barckward # loss0 = non_fx_out[0].sum() + non_fx_out[1].sum() From de65e6c3e88bc1b217b894bf20a4769748145605 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Tue, 13 Dec 2022 11:00:51 +0800 Subject: [PATCH 039/209] support output --- chunk_codegen.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index b5bb8f18560a..79cefddf07d2 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -56,6 +56,14 @@ def _is_non_compute_node_except_placeholder(node): return False +def _is_non_compute_node_except_placeholder_output(node): + if any(i in node.op for i in ["get_attr"]) or any( + i in node.name for i in ["getitem", "getattr"] + ): + return True + return False + + class FlowTracer(object): def __init__(self, gm) -> None: self.gm = gm @@ -1083,13 +1091,14 @@ def estimate_chunk_inference_mem( i is not None for i in [start_nodes, end_nodes, chunk_dims, chunk_sizes] ) chunk_within = False - chunk_region_idx = 0 + chunk_region_idx = None chunk_ratio = 1 # use it to estimate chunk mem for idx, node in enumerate(node_list): # if node in chunk start nodes, change chunk ratio and add chunk_tensor if use_chunk and idx in start_nodes: chunk_within = True + chunk_region_idx = start_nodes.index(idx) chunk_ratio = self._get_chunk_ratio( node, chunk_dims[chunk_region_idx], chunk_sizes[chunk_region_idx] ) @@ -1149,7 +1158,7 @@ 
def estimate_chunk_inference_mem( ) chunk_within = False chunk_ratio = 1 - chunk_region_idx += 1 + chunk_region_idx = None act_memory_after_node_log.append(act_memory) active_node_list_log.append(copy.deepcopy(active_node_list)) @@ -1467,7 +1476,7 @@ def _find_chunk_compute_input_and_output_nodes(nodes: List[Node]): if ( output_node not in nodes and node not in output_nodes - and not _is_non_compute_node_except_placeholder(output_node) + and not _is_non_compute_node_except_placeholder_output(output_node) ): output_nodes.append(node) From e83e3c615452c5f8ab04f558880c378256d95802 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Fri, 16 Dec 2022 11:09:35 +0800 Subject: [PATCH 040/209] update memory estimate --- chunk_codegen.py | 177 +++++++++++++++++++++++++++++------------------ 1 file changed, 111 insertions(+), 66 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 79cefddf07d2..18d9a0c8d764 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -896,23 +896,22 @@ def get_node_chunk_dim(self, node_from, node_from_dim, node_to): def _find_inherit_dim(self, input_node, input_dim, node): input_node_idx = _find_idx_by_name(input_node.name, self.nodes_list) - node_idx = _find_idx_by_name(node.name, self.nodes_list) node_trace_source = self._find_source_trace_from_node(node) for node_dim in range(len(_get_node_shape(node))): if ( input_node_idx in node_trace_source[node_dim] and node_trace_source[node_dim][input_node_idx] == input_dim ): - return {node_idx: node_dim} - return {} + return node_dim + return None def check_index_duplicate(self, chunk_infos): input_dim_after_node = {} for input_node_idx, input_node in enumerate(chunk_infos["inputs"]): for k, v in chunk_infos["inputs_dim"][input_node_idx].items(): - input_dim_after_node.update( - self._find_inherit_dim(input_node, v, self.nodes_list[k]) - ) + inherit_dim = self._find_inherit_dim(input_node, v, self.nodes_list[k]) + if inherit_dim: + input_dim_after_node[k] = inherit_dim for node in 
self.nodes_list[ chunk_infos["region"][0] : chunk_infos["region"][1] + 1 @@ -934,8 +933,8 @@ def check_index_duplicate(self, chunk_infos): class MemoryEstimator(object): - def __init__(self) -> None: - pass + def __init__(self, index_tracer: IndexTracer) -> None: + self.index_tracer = index_tracer def _get_meta_node_size(self, x): x = x.meta["tensor_meta"] @@ -950,6 +949,8 @@ def _get_output_node(self, n): } out_size = activation_size(fwd_out) out_node = [n.name] if out_size > 0 else [] + # if any(i in n.name for i in ['transpose', 'permute', 'view']): + # out_size = 0 return out_size, out_node def _get_output_node_size(self, n): @@ -961,11 +962,19 @@ def _add_active_node(self, n, active_list): if i not in active_list: active_list.append(i) - def _get_delete_node(self, user, user_to_last_uses): + def _get_delete_node(self, user, user_to_last_uses, to_keep=None): delete_size = 0 delete_node = [] if user.op not in ("placeholder", "output"): nodes_to_delete = user_to_last_uses.get(user, []) + if to_keep is not None: + keep_list = [] + for n in nodes_to_delete: + if n.name in to_keep: + keep_list.append(n) + for n in keep_list: + if n in nodes_to_delete: + nodes_to_delete.remove(n) if len(nodes_to_delete): out_node = [self._get_output_node(i) for i in nodes_to_delete] delete_size = sum([i[0] for i in out_node]) @@ -974,15 +983,30 @@ def _get_delete_node(self, user, user_to_last_uses): delete_node.append(out_node[i][1][0]) elif nodes_to_delete[i].op == "placeholder": delete_node.append(nodes_to_delete[i].name) + # elif any(j in nodes_to_delete[i].name for j in ['transpose', 'permute', 'view']): + # delete_node.append(nodes_to_delete[i].name) return delete_size, delete_node - def _get_delete_node_size(self, user, user_to_last_uses): - return self._get_delete_node(user, user_to_last_uses)[0] + def _get_delete_node_size(self, user, user_to_last_uses, to_keep): + return self._get_delete_node(user, user_to_last_uses, to_keep)[0] def _remove_deactive_node(self, user, 
user_to_last_uses, active_list): delete_node = self._get_delete_node(user, user_to_last_uses)[1] for i in delete_node: - active_list.remove(i) + if i in active_list: + active_list.remove(i) + + def _get_chunk_inputs_size(self, chunk_inputs, chunk_inputs_non_chunk, node_list, chunk_end_idx): + nodes_to_delete = [] + for chunk_input in chunk_inputs + chunk_inputs_non_chunk: + chunk_input_users = chunk_input.users.keys() + chunk_input_users_idx = [_find_idx_by_name(i.name, node_list) for i in chunk_input_users] + if all(i <= chunk_end_idx for i in chunk_input_users_idx): + if chunk_input not in nodes_to_delete: + nodes_to_delete.append(chunk_input) + out_node = [self._get_output_node(i) for i in nodes_to_delete] + delete_size = sum([i[0] for i in out_node]) + return delete_size def _get_last_usr(self, nodes): node_to_last_use: Dict[Node, Node] = {} @@ -1000,7 +1024,8 @@ def register_last_uses(n: Node, user: Node): def _get_contiguous_memory(self, node, not_contiguous_list, delete=False): mem = 0 - not_contiguous_ops = ["transpose", "permute"] + not_contiguous_ops = ["permute"] + inherit_contiguous_ops = ["transpose", "view"] if node.op == "call_function" and any( n in node.name for n in ["matmul", "reshape"] @@ -1020,30 +1045,36 @@ def _get_contiguous_memory(self, node, not_contiguous_list, delete=False): ): if node not in not_contiguous_list: not_contiguous_list.append(node) - elif any(i in node.args for i in not_contiguous_list): - if node not in not_contiguous_list: - not_contiguous_list.append(node) - return mem - def _get_chunk_ratio(self, node, chunk_dim, chunk_size): - sorted_dim = sorted(chunk_dim, key=lambda x: list(x.keys())[0]) - dim = list(sorted_dim[-1].values())[0] - shape = node.meta["tensor_meta"].shape - chunk_ratio = float(chunk_size) / shape[dim] - return chunk_ratio + def _get_chunk_ratio(self, node, chunk_inputs, chunk_inputs_dim, chunk_size): + node_shape = _get_node_shape(node) + node_source = self.index_tracer._find_source_trace_from_node(node) 
+ for (input_node, input_node_dim) in zip(chunk_inputs, chunk_inputs_dim): + for k, v in input_node_dim.items(): + inherit_dim = self.index_tracer._find_inherit_dim(input_node, v, self.index_tracer.nodes_list[k]) + if k == _find_idx_by_name(node.name, self.index_tracer.nodes_list): + chunk_ratio = float(chunk_size) / node_shape[inherit_dim] + return chunk_ratio + for dim, source in enumerate(node_source): + if k in source and source[k] == inherit_dim: + chunk_ratio = float(chunk_size) / node_shape[dim] + return chunk_ratio + return 1. def _get_chunk_delete_node_size( - self, user, user_to_last_uses, chunk_ratio, node_list, start_node, end_node + self, user, user_to_last_uses, chunk_ratio, chunk_inputs_names ): + # if any(j in user.name for j in ['transpose', 'permute', 'view']): + # return 0 if user.op in ("placeholder", "output"): return 0 nodes_to_delete = user_to_last_uses.get(user, []) delete_size = 0 for n in nodes_to_delete: - node_idx = _find_idx_by_name(n.name, node_list) - if start_node <= node_idx < end_node: - delete_size += self._get_output_node_size(n) * chunk_ratio + if n.name in chunk_inputs_names: + continue + delete_size += self._get_output_node_size(n) * chunk_ratio return delete_size def _print_mem_log(self, log, nodes, title=None): @@ -1071,10 +1102,7 @@ def _print_compute_op_mem_log(self, log, nodes, title=None): def estimate_chunk_inference_mem( self, gm: torch.fx.GraphModule, - start_nodes=None, - end_nodes=None, - chunk_dims=None, - chunk_sizes=None, + chunk_infos=None, ): act_memory = 0.0 act_memory_peak_log = [] @@ -1087,36 +1115,53 @@ def estimate_chunk_inference_mem( user_to_last_uses_no_free_var = self._get_last_usr(node_list) _delete_free_var_from_last_use(user_to_last_uses_no_free_var) - use_chunk = all( - i is not None for i in [start_nodes, end_nodes, chunk_dims, chunk_sizes] - ) + use_chunk = True if chunk_infos is not None else False chunk_within = False chunk_region_idx = None chunk_ratio = 1 # use it to estimate chunk mem + 
chunk_size = 1 + chunk_inputs_names = [] + + if use_chunk: + chunk_regions = [i["region"] for i in chunk_infos] + chunk_starts = [i[0] for i in chunk_regions] + chunk_ends = [i[1] for i in chunk_regions] + chunk_inputs = [i["inputs"] for i in chunk_infos] + chunk_inputs_non_chunk = [i["inputs_non_chunk"] for i in chunk_infos] + chunk_inputs_dim = [i["inputs_dim"] for i in chunk_infos] + chunk_inputs_names = [j.name for i in chunk_inputs for j in i] + [ + j.name for i in chunk_inputs_non_chunk for j in i + ] + chunk_outputs = [i["outputs"][0] for i in chunk_infos] for idx, node in enumerate(node_list): # if node in chunk start nodes, change chunk ratio and add chunk_tensor - if use_chunk and idx in start_nodes: + if use_chunk and idx in chunk_starts: chunk_within = True - chunk_region_idx = start_nodes.index(idx) + chunk_region_idx = chunk_starts.index(idx) + act_memory += self._get_output_node_size(chunk_outputs[chunk_region_idx]) / (1024**2) + + # determine chunk ratio for current node + if chunk_within: chunk_ratio = self._get_chunk_ratio( - node, chunk_dims[chunk_region_idx], chunk_sizes[chunk_region_idx] + node, chunk_inputs[chunk_region_idx], chunk_inputs_dim[chunk_region_idx], chunk_size ) - act_memory += self._get_output_node_size( - node_list[end_nodes[chunk_region_idx]] - ) / (1024**2) # if node is placeholder, just add the size of the node if node.op == "placeholder": act_memory += self._get_meta_node_size(node) * chunk_ratio / (1024**2) act_memory_peak_log.append(act_memory) - active_node_list.append(node.name) # skip output elif node.op == "output": continue - # node is an operation, calculate tmp, output node and delete node memory + # no change for non compute node + elif _is_non_compute_node_except_placeholder(node): + act_memory_peak_log.append(act_memory) + # node is a compute op + # calculate tmp, output node and delete node memory else: # forward memory + # TODO: contiguous_memory still not accurate for matmul, view, reshape and transpose 
act_memory += ( self._get_contiguous_memory(node, not_contiguous_list) * chunk_ratio @@ -1133,29 +1178,35 @@ def estimate_chunk_inference_mem( * chunk_ratio / (1024**2) ) + # delete unused vars not in chunk_input_list + # we can't delete input nodes until chunk ends if chunk_within: act_memory -= self._get_chunk_delete_node_size( node, user_to_last_uses_no_free_var, chunk_ratio, - node_list, - start_nodes[chunk_region_idx], - end_nodes[chunk_region_idx], + chunk_inputs_names ) / (1024**2) else: - act_memory -= self._get_delete_node_size( - node, user_to_last_uses_no_free_var - ) / (1024**2) + act_memory -= (self._get_delete_node_size( + node, user_to_last_uses_no_free_var, chunk_inputs_names + ) / (1024**2)) - # log active node + # log active node, only effective without chunk self._add_active_node(node, active_node_list) self._remove_deactive_node(node, user_to_last_uses, active_node_list) # if node in chunk end nodes, restore chunk settings - if use_chunk and idx in end_nodes: + if use_chunk and idx in chunk_ends: act_memory -= ( self._get_output_node_size(node) * chunk_ratio / (1024**2) ) + act_memory -= self._get_chunk_inputs_size( + chunk_inputs[chunk_region_idx], + chunk_inputs_non_chunk[chunk_region_idx], + node_list, + chunk_regions[chunk_region_idx][1] + ) / (1024**2) chunk_within = False chunk_ratio = 1 chunk_region_idx = None @@ -1178,11 +1229,11 @@ class ChunkRegionSearch(object): def __init__(self, gm) -> None: self.gm = gm self.node_list = list(gm.graph.nodes) - self.memory_estimator = MemoryEstimator() self.index_tracer = IndexTracer(gm) self.index_tracer.trace_index() self.flow_tracer = FlowTracer(gm) self.flow_tracer.trace_flow() + self.memory_estimator = MemoryEstimator(self.index_tracer) def _find_peak_node(self, mem_peak): max_value = max(mem_peak) @@ -1210,7 +1261,7 @@ def _search_max_chunk_region(self, active_node, peak_node, chunk_regions): min_var = self._get_min_free_var(active_node, free_vars) # from peak_node to free_var - 
chunk_region_start = None + chunk_region_start = len(free_vars) for i in range(peak_node, -1, -1): if len(active_node[i]) == min_var: chunk_region_start = i + 1 @@ -1218,7 +1269,7 @@ def _search_max_chunk_region(self, active_node, peak_node, chunk_regions): if i in free_vars or i == 0: raise RuntimeError() # from peak_node to len-2 - chunk_region_end = None + chunk_region_end = len(active_node) - 1 for i in range(peak_node, len(active_node)): if len(active_node[i]) == min_var: chunk_region_end = i @@ -1352,7 +1403,7 @@ def _stop_search(self, init_mem_peak, mem_peak): return False def search_region(self): - chunk_regions = [] + chunk_infos = [] ( init_mem_peak, _, @@ -1361,25 +1412,19 @@ def search_region(self): mem_peak = init_mem_peak while True: - chunk_region = self._step_search(mem_peak, active_node, chunk_regions) - if chunk_region is None: + chunk_info = self._step_search(mem_peak, active_node, chunk_infos) + if chunk_info is None: break - chunk_regions.append(chunk_region) + chunk_infos.append(chunk_info) ( mem_peak, _, active_node, - ) = self.memory_estimator.estimate_chunk_inference_mem( - self.gm, - [i["region"][0] for i in chunk_regions], - [i["region"][1] for i in chunk_regions], - [i["inputs_dim"] for i in chunk_regions], - [1] * len(chunk_regions), - ) + ) = self.memory_estimator.estimate_chunk_inference_mem(self.gm, chunk_infos) if self._stop_search(init_mem_peak, mem_peak): break - return chunk_regions + return chunk_infos def _gen_chunk_slice_dim(chunk_dim, chunk_idx_name, shape): @@ -1415,7 +1460,7 @@ def _gen_loop_end( chunk_slice = _gen_chunk_slice_dim( chunk_outputs_dim, "chunk_idx", chunk_output_shape ) - context = " chunk_result%s = %s\n" % (chunk_slice, chunk_outputs_name) + context = " chunk_result%s = %s; %s = None\n" % (chunk_slice, chunk_outputs_name, chunk_outputs_name) context += ( chunk_outputs_name + " = chunk_result; chunk_result = None; chunk_size = None" ) From e66a18a0bfaa87767d5869ab21a76c48af8b81cf Mon Sep 17 00:00:00 2001 
From: oahzxl Date: Fri, 16 Dec 2022 15:06:39 +0800 Subject: [PATCH 041/209] optimise search --- chunk_codegen.py | 67 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 47 insertions(+), 20 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 18d9a0c8d764..5e2130ee76f4 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -958,6 +958,8 @@ def _get_output_node_size(self, n): def _add_active_node(self, n, active_list): new_active = self._get_output_node(n)[1] + if n.op == 'placeholder': + new_active.append(n.name) for i in new_active: if i not in active_list: active_list.append(i) @@ -965,7 +967,7 @@ def _add_active_node(self, n, active_list): def _get_delete_node(self, user, user_to_last_uses, to_keep=None): delete_size = 0 delete_node = [] - if user.op not in ("placeholder", "output"): + if user.op not in ("output",): nodes_to_delete = user_to_last_uses.get(user, []) if to_keep is not None: keep_list = [] @@ -1258,24 +1260,30 @@ def _get_min_free_var(self, active_node_list, free_vars): def _search_max_chunk_region(self, active_node, peak_node, chunk_regions): free_vars = self._get_free_var() - min_var = self._get_min_free_var(active_node, free_vars) - + free_var_num = len(free_vars) + active_node_num = [len(i) for i in active_node] + min_active_node_num = min(active_node_num[free_var_num:]) + threshold = max(free_var_num, min_active_node_num) + # from peak_node to free_var - chunk_region_start = len(free_vars) + inside_flag = False + chunk_region_start = free_var_num for i in range(peak_node, -1, -1): - if len(active_node[i]) == min_var: + if active_node_num[i] <= threshold: + inside_flag = True + if inside_flag and active_node_num[i] > threshold: chunk_region_start = i + 1 break - if i in free_vars or i == 0: - raise RuntimeError() + # from peak_node to len-2 + inside_flag = False chunk_region_end = len(active_node) - 1 for i in range(peak_node, len(active_node)): - if len(active_node[i]) == min_var: + if active_node_num[i] <= 
threshold: + inside_flag = True + if inside_flag and active_node_num[i] > threshold: chunk_region_end = i break - if i in free_vars or i == 0: - raise RuntimeError() for i in chunk_regions: region = i["region"] @@ -1374,15 +1382,34 @@ def _search_possible_chunk_regions(self, max_chunk_region, peak_node): possible_chunk_region.extend(chunk_info) return possible_chunk_region - def _search_best_chunk_region(self, possible_chunk_regions): + def _search_best_chunk_region(self, possible_chunk_regions, chunk_infos): max_region_range = 0 - best_regions = None - for i in possible_chunk_regions: - if i["region"][1] - i["region"][0] > max_region_range: - best_regions = i - max_region_range = i["region"][1] - i["region"][0] - return best_regions - + best_region = None + while len(possible_chunk_regions) > 0: + for i in possible_chunk_regions: + if i["region"][1] - i["region"][0] > max_region_range: + best_region = i + max_region_range = i["region"][1] - i["region"][0] + if self._is_legal_region(best_region, chunk_infos): + break + possible_chunk_regions.remove(i) + max_region_range = 0 + best_region = None + return best_region + + def _is_legal_region(self, cur_chunk_info, chunk_infos): + (chunk_region_start, chunk_region_end) = cur_chunk_info["region"] + if cur_chunk_info in chunk_infos: + return False + if chunk_region_end < chunk_region_start: + return False + for i in chunk_infos: + region = i["region"] + if not ((chunk_region_start > region[1] and chunk_region_end > region[1]) + or (chunk_region_start < region[0] and chunk_region_end < region[0])): + return False + return True + def _step_search(self, mem_peak, active_node, chunk_regions): peak_node = self._find_peak_node(mem_peak) max_chunk_region = self._search_max_chunk_region( @@ -1393,7 +1420,7 @@ def _step_search(self, mem_peak, active_node, chunk_regions): possible_chunk_regions = self._search_possible_chunk_regions( max_chunk_region, peak_node ) - best_chunk_region = 
self._search_best_chunk_region(possible_chunk_regions) + best_chunk_region = self._search_best_chunk_region(possible_chunk_regions, chunk_regions) return best_chunk_region def _stop_search(self, init_mem_peak, mem_peak): @@ -1919,5 +1946,5 @@ def emit_node(node: Node, body): {prologue} {code}""" - print(fn_code) + # print(fn_code) return PythonCode(fn_code, globals_) From 9d516fa68f4e029d63b53d78803667bfa71e86d6 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Sun, 18 Dec 2022 20:37:55 +0800 Subject: [PATCH 042/209] fix layernorm --- chunk_codegen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 5e2130ee76f4..77c28fd32c88 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -574,7 +574,7 @@ def _assign_layernorm_index(self, node, idx): node_idx (int) """ self._assign_index_as_input(node, idx) - self._mark_computation(node, idx, [-1, -2]) + self._mark_computation(node, idx, [-1]) def _assign_elementwise_index(self, node, idx): """ From d734529a390087f1366b7573410eca5775735b14 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Wed, 21 Dec 2022 15:00:24 +0800 Subject: [PATCH 043/209] move flow tracer --- chunk_codegen.py | 413 ++++++++++++++++++++++++----------------------- 1 file changed, 207 insertions(+), 206 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 77c28fd32c88..2c1c09ae5238 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -64,212 +64,6 @@ def _is_non_compute_node_except_placeholder_output(node): return False -class FlowTracer(object): - def __init__(self, gm) -> None: - self.gm = gm - self.node_list = list(gm.graph.nodes) - self.flow_trace = {} - - def _add_trace(self, name): - self.flow_trace[name] = [] - - def _add_node(self, trace_name, node): - self.flow_trace[trace_name].append( - {"node": node, "inside_depend": [], "outside_depend": []} - ) - - def _add_inside_depend(self, flow_name, node, inside_depend_node): - for i in self.flow_trace[flow_name]: - if i["node"] == node: 
- i["inside_depend"].append(inside_depend_node) - return - raise RuntimeError("node not found") - - def _add_outside_depend( - self, flow_name, node, outside_depend_node, outside_depend_trace - ): - for i in self.flow_trace[flow_name]: - if i["node"] == node: - i["outside_depend"].append({outside_depend_trace: outside_depend_node}) - return - raise RuntimeError("node not found") - - def _init_trace(self): - for i in self.node_list: - if i.op == "placeholder": - self._add_trace(i.name) - self._add_node(i.name, i) - - def _find_flow_for_node(self, node): - if type(self.node_list[0]) != type(node): - return None - if _is_non_compute_node_except_placeholder(node): - return None - for name, trace in self.flow_trace.items(): - for i in trace: - if node == i["node"]: - return name - if any(i in node.name for i in ["ones_like"]): - self._add_trace(node.name) - self._add_node(node.name, node) - return node.name - raise RuntimeError("node not found") - - def _find_first_valid_flow(self, flow): - for i in flow: - if i is not None: - return i - raise RuntimeError("invalid flow") - - def find_node_flow(self, node): - for name, trace in self.flow_trace.items(): - for i in trace: - if node == i["node"]: - return name, i - raise RuntimeError("invalid node") - - def _get_flow_mix_node(self, node): - if _is_non_compute_node(node): - return None - _, node_trace = self.find_node_flow(node) - if len(node_trace["outside_depend"]) == 0: - return None - elif len(node_trace["outside_depend"]) > 1: - raise NotImplementedError - vars = list(node_trace["outside_depend"][0].values())[0] - return vars - - def _get_same_flow_node(self, node_list, node): - name, _ = self.find_node_flow(node) - result = [] - for i in self.flow_trace[name]: - if i["node"] in node_list: - result.append(i["node"]) - return result - - def trace_flow(self): - # init trace - self._init_trace() - - for node in self.node_list: - # skip if non compute node - if all( - type(arg) != type(node) or 
_is_non_compute_node_except_placeholder(arg) - for arg in node.args - ) or _is_non_compute_node(node): - continue - - node_input_flows = [self._find_flow_for_node(arg) for arg in node.args] - - node_domin_flow = self._find_first_valid_flow(node_input_flows) - self._add_node(node_domin_flow, node) - for node_input_flow, arg in zip(node_input_flows, node.args): - if node_input_flow is None: - continue - elif node_input_flow == node_domin_flow: - self._add_inside_depend(node_domin_flow, node, arg) - else: - self._add_outside_depend( - node_domin_flow, node, arg, node_input_flow - ) - return self.flow_trace - - def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer): - inputs, outputs = _find_chunk_compute_input_and_output_nodes( - self.node_list[start_idx : end_idx + 1] - ) - chunk_info = { - "region": (start_idx, end_idx), - "inputs": inputs, - "inputs_non_chunk": [], - "inputs_dim": start_dim, - "outputs": outputs, - "outputs_dim": end_dim, - "args": {}, - } - flow_block = False - - # TODO don't allow multi outputs now - if len(outputs) > 1: - flow_block = True - return flow_block, chunk_info - - for idx in range(start_idx, end_idx + 1): - node = self.node_list[idx] - mix_flow_node = self._get_flow_mix_node(node) - if mix_flow_node is None: - continue - - # if there is a flow mix, op must be in [mul, add, matmul] - # element-wise op requires dim to be equal in every dim - if any(n in node.name for n in ["mul", "add"]): - for i in node.args: - if type(i) == type(mix_flow_node) and i != mix_flow_node: - main_flow_var = i - # if mix flow is a broadcast in chunk dim, - # TODO: need to move that flow out of the chunk - mix_flow_node_dim = index_tracer.get_node_chunk_dim( - self.node_list[end_idx], end_dim, node - ) - if mix_flow_node_dim is None: - flow_block = True - break - if _get_node_shape(mix_flow_node)[mix_flow_node_dim] == 1: - flow_block = False - for i in self._get_same_flow_node( - chunk_info["inputs"], mix_flow_node - ): - 
chunk_info["inputs"].remove(i) - # else, we need to chunk mix var as well - else: - # TODO chunk another value - flow_block = True - break - else: - raise NotImplementedError("%s not implemented" % node.name) - - if flow_block: - flow_block = True - return flow_block, chunk_info - - inputs_dim = [] - remove_inputs = [] - for input_node in chunk_info["inputs"]: - input_dict = {} - for user in input_node.users.keys(): - if _is_non_compute_node(user): - continue - user_idx = _find_idx_by_name(user.name, self.node_list) - dim = None - if start_dim <= user_idx < end_idx: - dim = index_tracer.get_node_chunk_dim( - self.node_list[end_idx], end_dim, input_node - ) - elif user_idx == end_idx: - dim = end_dim - # n has relation with chunk dim - if dim is not None and _get_node_shape(user)[dim] != 1: - input_dict[user_idx] = dim - if len(input_dict) == 0: - remove_inputs.append(input_node) - else: - inputs_dim.append(input_dict) - chunk_info["inputs_dim"] = inputs_dim - for i in remove_inputs: - if i in chunk_info["inputs"]: - chunk_info["inputs"].remove(i) - - # we need to log input nodes to avoid deleteing them in the loop - non_chunk_inputs = _find_chunk_all_input_nodes( - self.node_list[start_idx : end_idx + 1] - ) - for i in non_chunk_inputs: - if i not in chunk_info["inputs"]: - chunk_info["inputs_non_chunk"].append(i) - - return flow_block, chunk_info - - class IndexTracer(object): def __init__(self, gm) -> None: self.gm = gm @@ -932,6 +726,213 @@ def check_index_duplicate(self, chunk_infos): return True + +class FlowTracer(object): + def __init__(self, gm) -> None: + self.gm = gm + self.node_list = list(gm.graph.nodes) + self.flow_trace = {} + + def _add_trace(self, name): + self.flow_trace[name] = [] + + def _add_node(self, trace_name, node): + self.flow_trace[trace_name].append( + {"node": node, "inside_depend": [], "outside_depend": []} + ) + + def _add_inside_depend(self, flow_name, node, inside_depend_node): + for i in self.flow_trace[flow_name]: + if i["node"] 
== node: + i["inside_depend"].append(inside_depend_node) + return + raise RuntimeError("node not found") + + def _add_outside_depend( + self, flow_name, node, outside_depend_node, outside_depend_trace + ): + for i in self.flow_trace[flow_name]: + if i["node"] == node: + i["outside_depend"].append({outside_depend_trace: outside_depend_node}) + return + raise RuntimeError("node not found") + + def _init_trace(self): + for i in self.node_list: + if i.op == "placeholder": + self._add_trace(i.name) + self._add_node(i.name, i) + + def _find_flow_for_node(self, node): + if type(self.node_list[0]) != type(node): + return None + if _is_non_compute_node_except_placeholder(node): + return None + for name, trace in self.flow_trace.items(): + for i in trace: + if node == i["node"]: + return name + if any(i in node.name for i in ["ones_like"]): + self._add_trace(node.name) + self._add_node(node.name, node) + return node.name + raise RuntimeError("node not found") + + def _find_first_valid_flow(self, flow): + for i in flow: + if i is not None: + return i + raise RuntimeError("invalid flow") + + def find_node_flow(self, node): + for name, trace in self.flow_trace.items(): + for i in trace: + if node == i["node"]: + return name, i + raise RuntimeError("invalid node") + + def _get_flow_mix_node(self, node): + if _is_non_compute_node(node): + return None + _, node_trace = self.find_node_flow(node) + if len(node_trace["outside_depend"]) == 0: + return None + elif len(node_trace["outside_depend"]) > 1: + raise NotImplementedError + vars = list(node_trace["outside_depend"][0].values())[0] + return vars + + def _get_same_flow_node(self, node_list, node): + name, _ = self.find_node_flow(node) + result = [] + for i in self.flow_trace[name]: + if i["node"] in node_list: + result.append(i["node"]) + return result + + def trace_flow(self): + # init trace + self._init_trace() + + for node in self.node_list: + # skip if non compute node + if all( + type(arg) != type(node) or 
_is_non_compute_node_except_placeholder(arg) + for arg in node.args + ) or _is_non_compute_node(node): + continue + + node_input_flows = [self._find_flow_for_node(arg) for arg in node.args] + + node_domin_flow = self._find_first_valid_flow(node_input_flows) + self._add_node(node_domin_flow, node) + for node_input_flow, arg in zip(node_input_flows, node.args): + if node_input_flow is None: + continue + elif node_input_flow == node_domin_flow: + self._add_inside_depend(node_domin_flow, node, arg) + else: + self._add_outside_depend( + node_domin_flow, node, arg, node_input_flow + ) + return self.flow_trace + + def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer: IndexTracer): + inputs, outputs = _find_chunk_compute_input_and_output_nodes( + self.node_list[start_idx : end_idx + 1] + ) + chunk_info = { + "region": (start_idx, end_idx), + "inputs": inputs, + "inputs_non_chunk": [], + "inputs_dim": start_dim, + "outputs": outputs, + "outputs_dim": end_dim, + "args": {}, + } + flow_block = False + + # TODO don't allow multi outputs now + if len(outputs) > 1: + flow_block = True + return flow_block, chunk_info + + for idx in range(start_idx, end_idx + 1): + node = self.node_list[idx] + mix_flow_node = self._get_flow_mix_node(node) + if mix_flow_node is None: + continue + + # if there is a flow mix, op must be in [mul, add, matmul] + # element-wise op requires dim to be equal in every dim + if any(n in node.name for n in ["mul", "add"]): + for i in node.args: + if type(i) == type(mix_flow_node) and i != mix_flow_node: + main_flow_var = i + # if mix flow is a broadcast in chunk dim, + # TODO: need to move that flow out of the chunk + mix_flow_node_dim = index_tracer.get_node_chunk_dim( + self.node_list[end_idx], end_dim, node + ) + if mix_flow_node_dim is None: + flow_block = True + break + if _get_node_shape(mix_flow_node)[mix_flow_node_dim] == 1: + flow_block = False + for i in self._get_same_flow_node( + chunk_info["inputs"], mix_flow_node + ): + 
chunk_info["inputs"].remove(i) + # else, we need to chunk mix var as well + else: + # TODO chunk another value + flow_block = True + break + else: + raise NotImplementedError("%s not implemented" % node.name) + + if flow_block: + flow_block = True + return flow_block, chunk_info + + inputs_dim = [] + remove_inputs = [] + for input_node in chunk_info["inputs"]: + input_dict = {} + for user in input_node.users.keys(): + if _is_non_compute_node(user): + continue + user_idx = _find_idx_by_name(user.name, self.node_list) + dim = None + if start_dim <= user_idx < end_idx: + dim = index_tracer.get_node_chunk_dim( + self.node_list[end_idx], end_dim, input_node + ) + elif user_idx == end_idx: + dim = end_dim + # n has relation with chunk dim + if dim is not None and _get_node_shape(user)[dim] != 1: + input_dict[user_idx] = dim + if len(input_dict) == 0: + remove_inputs.append(input_node) + else: + inputs_dim.append(input_dict) + chunk_info["inputs_dim"] = inputs_dim + for i in remove_inputs: + if i in chunk_info["inputs"]: + chunk_info["inputs"].remove(i) + + # we need to log input nodes to avoid deleteing them in the loop + non_chunk_inputs = _find_chunk_all_input_nodes( + self.node_list[start_idx : end_idx + 1] + ) + for i in non_chunk_inputs: + if i not in chunk_info["inputs"]: + chunk_info["inputs_non_chunk"].append(i) + + return flow_block, chunk_info + + class MemoryEstimator(object): def __init__(self, index_tracer: IndexTracer) -> None: self.index_tracer = index_tracer From d361d533e8e7773d2009cc4ff5a82633401ab44a Mon Sep 17 00:00:00 2001 From: oahzxl Date: Wed, 21 Dec 2022 15:01:03 +0800 Subject: [PATCH 044/209] refactor flow tracer --- chunk_codegen.py | 283 +++++++++++++++++++++++++++++++++-------- evoformer/evoformer.py | 11 +- 2 files changed, 240 insertions(+), 54 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 2c1c09ae5238..3ba082ceb845 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -139,7 +139,13 @@ def _add_source(self, 
node_from, node_from_dim, node_to, node_to_dim, init=False node_from_idx = _find_idx_by_name(node_from.name, self.nodes_list) if init: node_to_trace["source"][node_to_dim] = {} - node_to_trace["source"][node_to_dim][node_from_idx] = node_from_dim + # add dim to cur new source + if node_from_idx not in node_to_trace["source"][node_to_dim]: + node_to_trace["source"][node_to_dim][node_from_idx] = [node_from_dim] + else: + if node_from_dim not in node_to_trace["source"][node_to_dim][node_from_idx]: + node_to_trace["source"][node_to_dim][node_from_idx].append(node_from_dim) + # update inputs source node_to_trace["source"][node_to_dim].update( node_from_trace["source"][node_from_dim] ) @@ -654,7 +660,7 @@ def check_index_source(self, start_dim, start_node, start_idx, end_dim, end_node end_node_trace_source.items(), key=lambda d: d[0], reverse=True ) for node_idx, node_dim in sorted_source: - if node_idx == start_node_idx and node_dim == start_dim: + if node_idx == start_node_idx and start_dim in node_dim: return True # it means we meet a node outside the loop, and the node is not input node if node_idx < start_idx: @@ -694,12 +700,12 @@ def _find_inherit_dim(self, input_node, input_dim, node): for node_dim in range(len(_get_node_shape(node))): if ( input_node_idx in node_trace_source[node_dim] - and node_trace_source[node_dim][input_node_idx] == input_dim + and input_dim in node_trace_source[node_dim][input_node_idx] ): return node_dim return None - def check_index_duplicate(self, chunk_infos): + def check_index_duplicate(self, chunk_infos, return_dim=False): input_dim_after_node = {} for input_node_idx, input_node in enumerate(chunk_infos["inputs"]): for k, v in chunk_infos["inputs_dim"][input_node_idx].items(): @@ -713,17 +719,30 @@ def check_index_duplicate(self, chunk_infos): if _is_non_compute_node_except_placeholder(node): continue count = 0 + duplicate_dims = [] node_trace_source = self._find_source_trace_from_node(node) for node_dim in 
range(len(_get_node_shape(node))): + duplicate_dim = [] + duplicate_flag = False dim_source = node_trace_source[node_dim] for k, v in dim_source.items(): if chunk_infos["region"][0] <= k <= chunk_infos["region"][1]: - if k in input_dim_after_node and input_dim_after_node[k] == v: - count += 1 - break + if k in input_dim_after_node and input_dim_after_node[k] in v: + duplicate_flag = True + duplicate_dim.append((k, v)) + duplicate_dims.append(duplicate_dim) + if duplicate_flag: + count += 1 + if count > 1: - return False - return True + if return_dim: + return False, duplicate_dims + else: + return False + if return_dim: + return True, None + else: + return True @@ -857,43 +876,45 @@ def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer: Ind flow_block = True return flow_block, chunk_info - for idx in range(start_idx, end_idx + 1): - node = self.node_list[idx] - mix_flow_node = self._get_flow_mix_node(node) - if mix_flow_node is None: - continue - - # if there is a flow mix, op must be in [mul, add, matmul] - # element-wise op requires dim to be equal in every dim - if any(n in node.name for n in ["mul", "add"]): - for i in node.args: - if type(i) == type(mix_flow_node) and i != mix_flow_node: - main_flow_var = i - # if mix flow is a broadcast in chunk dim, - # TODO: need to move that flow out of the chunk - mix_flow_node_dim = index_tracer.get_node_chunk_dim( - self.node_list[end_idx], end_dim, node - ) - if mix_flow_node_dim is None: - flow_block = True - break - if _get_node_shape(mix_flow_node)[mix_flow_node_dim] == 1: - flow_block = False - for i in self._get_same_flow_node( - chunk_info["inputs"], mix_flow_node - ): - chunk_info["inputs"].remove(i) - # else, we need to chunk mix var as well - else: - # TODO chunk another value - flow_block = True - break - else: - raise NotImplementedError("%s not implemented" % node.name) - - if flow_block: - flow_block = True - return flow_block, chunk_info + # for idx in range(start_idx, end_idx + 1): 
+ # node = self.node_list[idx] + # mix_flow_node = self._get_flow_mix_node(node) + # if mix_flow_node is None: + # continue + + # # if there is a flow mix, op must be in [mul, add, matmul] + # # element-wise op requires dim to be equal in every dim + # if any(n in node.name for n in ["mul", "add"]): + # for i in node.args: + # if type(i) == type(mix_flow_node) and i != mix_flow_node: + # main_flow_var = i + # # if mix flow is a broadcast in chunk dim, + # # TODO: need to move that flow out of the chunk + # mix_flow_node_dim = index_tracer.get_node_chunk_dim( + # self.node_list[end_idx], end_dim, node + # ) + # # TODO: we need to loop every dim + # if isinstance(mix_flow_node_dim, list): + # mix_flow_node_dim = mix_flow_node_dim[0] + # if mix_flow_node_dim is None: + # flow_block = True + # break + # if _get_node_shape(mix_flow_node)[mix_flow_node_dim] == 1: + # flow_block = False + # for i in self._get_same_flow_node( + # chunk_info["inputs"], mix_flow_node + # ): + # chunk_info["inputs"].remove(i) + # # else, we need to chunk mix var as well + # else: + # # TODO chunk another value + # flow_block = True + # break + # else: + # raise NotImplementedError("%s not implemented" % node.name) + # if flow_block: + # flow_block = True + # return flow_block, chunk_info inputs_dim = [] remove_inputs = [] @@ -908,6 +929,9 @@ def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer: Ind dim = index_tracer.get_node_chunk_dim( self.node_list[end_idx], end_dim, input_node ) + # TODO: we need to loop every dim + if isinstance(dim, list): + dim = dim[0] elif user_idx == end_idx: dim = end_dim # n has relation with chunk dim @@ -921,6 +945,8 @@ def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer: Ind for i in remove_inputs: if i in chunk_info["inputs"]: chunk_info["inputs"].remove(i) + + duplicate_result, duplicate_dim = index_tracer.check_index_duplicate(chunk_info, return_dim=True) # we need to log input nodes to avoid deleteing them in 
the loop non_chunk_inputs = _find_chunk_all_input_nodes( @@ -932,6 +958,150 @@ def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer: Ind return flow_block, chunk_info + def _assgin_single_node_flow(self, arg_node, start_idx, end_idx, + inputs, index_tracer, cur_node_dim, + cur_node_compute, cur_node_source, cur_node_fix_dim, all_node_info, + next_node_list): + arg_idx = _find_idx_by_name(arg_node.name, index_tracer.nodes_list) + # arg in chunk range or be inputs + if not (start_idx <= arg_idx < end_idx): + return True + + # find arg dim + if cur_node_dim is not None: + # dim is computed + if arg_idx in cur_node_compute[cur_node_dim]: + return False + if arg_idx not in cur_node_source[cur_node_dim]: + arg_dim = None + else: + arg_dim = cur_node_source[cur_node_dim][arg_idx][0] + else: + arg_dim = None + + # get fix dim + arg_fix_dim = [] + if cur_node_dim is not None: + for i in cur_node_fix_dim: + fix_dim_source = cur_node_source[i] + if arg_idx in fix_dim_source: + arg_fix_dim.append(fix_dim_source[arg_idx][0]) + + # if already in node_info, arg dim must be same + if arg_node in all_node_info: + if all_node_info[arg_node] != arg_dim: + return False + all_node_info[arg_node]['fix_dim'] = list(set(all_node_info[arg_node]['fix_dim'] + arg_fix_dim)) + # else add it to list + else: + all_node_info[arg_node] = {'chunk_dim': arg_dim, 'fix_dim': arg_fix_dim} + + next_node_list.append(arg_node) + return True + + def flow_search(self, start_idx, start_dim, end_idx, end_dim, index_tracer: IndexTracer): + inputs, outputs = _find_chunk_compute_input_and_output_nodes( + self.node_list[start_idx : end_idx + 1] + ) + # only single ouput + if len(outputs) > 1: + return None + + cur_node_list = [index_tracer.nodes_list[end_idx]] # start from the last node + all_node_info = {cur_node_list[0]: {'chunk_dim': end_dim, 'fix_dim': []}} + + while len(cur_node_list) > 0: + next_node_list = [] + + for cur_node in cur_node_list: + # get cur node info + 
cur_node_chunk_dim = all_node_info[cur_node]['chunk_dim'] + cur_node_fix_dim = all_node_info[cur_node]['fix_dim'] + cur_node_idx = _find_idx_by_name(cur_node.name, index_tracer.nodes_list) + if cur_node_chunk_dim: + cur_node_compute = index_tracer._find_compute_trace_from_node(cur_node) + cur_node_source = index_tracer._find_source_trace_from_node(cur_node) + else: + cur_node_compute = cur_node_source = None + + # get all valid args + arg_list = [] + for arg in cur_node.args: + if type(arg) != type(cur_node): + continue + if _is_non_compute_node(arg): + continue + arg_list.append(arg) + flow_flag = self._assgin_single_node_flow(arg, start_idx, end_idx, + inputs, index_tracer, cur_node_chunk_dim, + cur_node_compute, cur_node_source, cur_node_fix_dim, all_node_info, + next_node_list) + if flow_flag == False: + return None + + if len(arg_list) == 2: + if any(i in cur_node.name for i in ["add", "mul"]): + for arg in arg_list: + if not (start_idx <= _find_idx_by_name(arg.name, index_tracer.nodes_list) < end_idx): + continue + arg_chunk_dim = all_node_info[arg]['chunk_dim'] + arg_fix_dim = all_node_info[arg]['fix_dim'] + arg_shape = _get_node_shape(arg) + # add all dim as fix dim except chunk dim + for i, shape in enumerate(arg_shape): + if shape != 1 and i != cur_node_chunk_dim: + if i == arg_chunk_dim: + return None + if i not in arg_fix_dim: + arg_fix_dim.append(i) + elif "einsum" in cur_node.name: + pass + elif "matmul" in cur_node.name: + pass + else: + raise NotImplementedError() + cur_node_list = next_node_list + + inputs_dim = [] + remove_inputs = [] + for input_node in inputs: + input_dict = {} + for user in input_node.users.keys(): + if _is_non_compute_node(user): + continue + user_idx = _find_idx_by_name(user.name, self.node_list) + if start_idx <= user_idx <= end_idx: + chunk_dim = all_node_info[user]['chunk_dim'] + if chunk_dim is not None: + input_dict[user_idx] = chunk_dim + if len(input_dict) == 0: + remove_inputs.append(input_node) + else: + 
inputs_dim.append(input_dict) + for i in remove_inputs: + if i in inputs: + inputs.remove(i) + + chunk_info = { + "region": (start_idx, end_idx), + "inputs": inputs, + "inputs_non_chunk": [], + "inputs_dim": inputs_dim, + "outputs": outputs, + "outputs_dim": end_dim, + "args": {}, + } + + # we need to log input nodes to avoid deleteing them in the loop + non_chunk_inputs = _find_chunk_all_input_nodes( + self.node_list[start_idx : end_idx + 1] + ) + for i in non_chunk_inputs: + if i not in chunk_info["inputs"]: + chunk_info["inputs_non_chunk"].append(i) + + return chunk_info + class MemoryEstimator(object): def __init__(self, index_tracer: IndexTracer) -> None: @@ -1055,12 +1225,13 @@ def _get_chunk_ratio(self, node, chunk_inputs, chunk_inputs_dim, chunk_size): node_source = self.index_tracer._find_source_trace_from_node(node) for (input_node, input_node_dim) in zip(chunk_inputs, chunk_inputs_dim): for k, v in input_node_dim.items(): + # TODO: inherit dim should be list too, int now inherit_dim = self.index_tracer._find_inherit_dim(input_node, v, self.index_tracer.nodes_list[k]) if k == _find_idx_by_name(node.name, self.index_tracer.nodes_list): chunk_ratio = float(chunk_size) / node_shape[inherit_dim] return chunk_ratio for dim, source in enumerate(node_source): - if k in source and source[k] == inherit_dim: + if k in source and inherit_dim in source[k]: chunk_ratio = float(chunk_size) / node_shape[dim] return chunk_ratio return 1. 
@@ -1323,9 +1494,11 @@ def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx): continue for start_node, start_trace in start_traces.items(): for start_dim, start_trace_idx in enumerate(start_trace["idx"]): - # must be same trace idx - if start_trace_idx != end_trace_idx: - continue + if start_idx == 199 and end_idx == 229 and start_dim == 2 and end_dim == 2: + print(1) + self.flow_tracer.flow_search( + start_idx, start_dim, end_idx, end_dim, self.index_tracer + ) # dim size cannot be 1 if ( _get_node_shape(end_node)[end_dim] == 1 @@ -1343,10 +1516,16 @@ def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx): ): continue # detect flow meet - flow_block, chunk_info = self.flow_tracer._detect_flow( + # flow_block, chunk_info = self.flow_tracer._detect_flow( + # start_idx, start_dim, end_idx, end_dim, self.index_tracer + # ) + # if flow_block: + # continue + # flow search + chunk_info = self.flow_tracer.flow_search( start_idx, start_dim, end_idx, end_dim, self.index_tracer ) - if flow_block: + if chunk_info is None: continue # check index copmute if not self.index_tracer.check_index_duplicate(chunk_info): diff --git a/evoformer/evoformer.py b/evoformer/evoformer.py index 0c5ab952a779..cfd2bb2a2529 100644 --- a/evoformer/evoformer.py +++ b/evoformer/evoformer.py @@ -6,6 +6,13 @@ from .triangle import PairStack +def print_memory(init_mem, text=None): + now_mem = torch.cuda.memory_allocated() / 1024 ** 2 - init_mem + max_mem = torch.cuda.max_memory_allocated() / 1024 ** 2 - init_mem + print("%s now:%.2f max:%.2f" % ("" if text is None else text, now_mem, max_mem)) + torch.cuda.reset_peak_memory_stats() + + class EvoformerBlock(nn.Module): def __init__(self, d_node, d_pair): @@ -16,9 +23,9 @@ def __init__(self, d_node, d_pair): self.pair_stack = PairStack(d_pair=d_pair) def forward(self, node, pair): - node = node + self.msa_stack(node, pair) + node = self.msa_stack(node, pair) pair = pair + self.communication(node) - pair = pair + 
self.pair_stack(pair) + pair = self.pair_stack(pair) return node, pair From ded1005667402ee9458afa53852ce2018b1ccb10 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Wed, 21 Dec 2022 15:03:08 +0800 Subject: [PATCH 045/209] format code --- chunk_codegen.py | 184 +++++++++++++++++++++++++++++++---------------- 1 file changed, 122 insertions(+), 62 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 3ba082ceb845..eb16361c04fc 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -144,7 +144,9 @@ def _add_source(self, node_from, node_from_dim, node_to, node_to_dim, init=False node_to_trace["source"][node_to_dim][node_from_idx] = [node_from_dim] else: if node_from_dim not in node_to_trace["source"][node_to_dim][node_from_idx]: - node_to_trace["source"][node_to_dim][node_from_idx].append(node_from_dim) + node_to_trace["source"][node_to_dim][node_from_idx].append( + node_from_dim + ) # update inputs source node_to_trace["source"][node_to_dim].update( node_from_trace["source"][node_from_dim] @@ -745,7 +747,6 @@ def check_index_duplicate(self, chunk_infos, return_dim=False): return True - class FlowTracer(object): def __init__(self, gm) -> None: self.gm = gm @@ -856,7 +857,9 @@ def trace_flow(self): ) return self.flow_trace - def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer: IndexTracer): + def _detect_flow( + self, start_idx, start_dim, end_idx, end_dim, index_tracer: IndexTracer + ): inputs, outputs = _find_chunk_compute_input_and_output_nodes( self.node_list[start_idx : end_idx + 1] ) @@ -945,8 +948,10 @@ def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer: Ind for i in remove_inputs: if i in chunk_info["inputs"]: chunk_info["inputs"].remove(i) - - duplicate_result, duplicate_dim = index_tracer.check_index_duplicate(chunk_info, return_dim=True) + + duplicate_result, duplicate_dim = index_tracer.check_index_duplicate( + chunk_info, return_dim=True + ) # we need to log input nodes to avoid deleteing them in 
the loop non_chunk_inputs = _find_chunk_all_input_nodes( @@ -958,15 +963,25 @@ def _detect_flow(self, start_idx, start_dim, end_idx, end_dim, index_tracer: Ind return flow_block, chunk_info - def _assgin_single_node_flow(self, arg_node, start_idx, end_idx, - inputs, index_tracer, cur_node_dim, - cur_node_compute, cur_node_source, cur_node_fix_dim, all_node_info, - next_node_list): + def _assgin_single_node_flow( + self, + arg_node, + start_idx, + end_idx, + inputs, + index_tracer, + cur_node_dim, + cur_node_compute, + cur_node_source, + cur_node_fix_dim, + all_node_info, + next_node_list, + ): arg_idx = _find_idx_by_name(arg_node.name, index_tracer.nodes_list) # arg in chunk range or be inputs if not (start_idx <= arg_idx < end_idx): return True - + # find arg dim if cur_node_dim is not None: # dim is computed @@ -978,7 +993,7 @@ def _assgin_single_node_flow(self, arg_node, start_idx, end_idx, arg_dim = cur_node_source[cur_node_dim][arg_idx][0] else: arg_dim = None - + # get fix dim arg_fix_dim = [] if cur_node_dim is not None: @@ -986,44 +1001,52 @@ def _assgin_single_node_flow(self, arg_node, start_idx, end_idx, fix_dim_source = cur_node_source[i] if arg_idx in fix_dim_source: arg_fix_dim.append(fix_dim_source[arg_idx][0]) - + # if already in node_info, arg dim must be same if arg_node in all_node_info: if all_node_info[arg_node] != arg_dim: return False - all_node_info[arg_node]['fix_dim'] = list(set(all_node_info[arg_node]['fix_dim'] + arg_fix_dim)) + all_node_info[arg_node]["fix_dim"] = list( + set(all_node_info[arg_node]["fix_dim"] + arg_fix_dim) + ) # else add it to list else: - all_node_info[arg_node] = {'chunk_dim': arg_dim, 'fix_dim': arg_fix_dim} - + all_node_info[arg_node] = {"chunk_dim": arg_dim, "fix_dim": arg_fix_dim} + next_node_list.append(arg_node) return True - - def flow_search(self, start_idx, start_dim, end_idx, end_dim, index_tracer: IndexTracer): + + def flow_search( + self, start_idx, start_dim, end_idx, end_dim, index_tracer: IndexTracer + 
): inputs, outputs = _find_chunk_compute_input_and_output_nodes( self.node_list[start_idx : end_idx + 1] ) # only single ouput if len(outputs) > 1: return None - + cur_node_list = [index_tracer.nodes_list[end_idx]] # start from the last node - all_node_info = {cur_node_list[0]: {'chunk_dim': end_dim, 'fix_dim': []}} - + all_node_info = {cur_node_list[0]: {"chunk_dim": end_dim, "fix_dim": []}} + while len(cur_node_list) > 0: next_node_list = [] for cur_node in cur_node_list: # get cur node info - cur_node_chunk_dim = all_node_info[cur_node]['chunk_dim'] - cur_node_fix_dim = all_node_info[cur_node]['fix_dim'] + cur_node_chunk_dim = all_node_info[cur_node]["chunk_dim"] + cur_node_fix_dim = all_node_info[cur_node]["fix_dim"] cur_node_idx = _find_idx_by_name(cur_node.name, index_tracer.nodes_list) if cur_node_chunk_dim: - cur_node_compute = index_tracer._find_compute_trace_from_node(cur_node) - cur_node_source = index_tracer._find_source_trace_from_node(cur_node) + cur_node_compute = index_tracer._find_compute_trace_from_node( + cur_node + ) + cur_node_source = index_tracer._find_source_trace_from_node( + cur_node + ) else: cur_node_compute = cur_node_source = None - + # get all valid args arg_list = [] for arg in cur_node.args: @@ -1032,20 +1055,33 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim, index_tracer: Inde if _is_non_compute_node(arg): continue arg_list.append(arg) - flow_flag = self._assgin_single_node_flow(arg, start_idx, end_idx, - inputs, index_tracer, cur_node_chunk_dim, - cur_node_compute, cur_node_source, cur_node_fix_dim, all_node_info, - next_node_list) + flow_flag = self._assgin_single_node_flow( + arg, + start_idx, + end_idx, + inputs, + index_tracer, + cur_node_chunk_dim, + cur_node_compute, + cur_node_source, + cur_node_fix_dim, + all_node_info, + next_node_list, + ) if flow_flag == False: return None - + if len(arg_list) == 2: if any(i in cur_node.name for i in ["add", "mul"]): for arg in arg_list: - if not (start_idx <= 
_find_idx_by_name(arg.name, index_tracer.nodes_list) < end_idx): + if not ( + start_idx + <= _find_idx_by_name(arg.name, index_tracer.nodes_list) + < end_idx + ): continue - arg_chunk_dim = all_node_info[arg]['chunk_dim'] - arg_fix_dim = all_node_info[arg]['fix_dim'] + arg_chunk_dim = all_node_info[arg]["chunk_dim"] + arg_fix_dim = all_node_info[arg]["fix_dim"] arg_shape = _get_node_shape(arg) # add all dim as fix dim except chunk dim for i, shape in enumerate(arg_shape): @@ -1061,7 +1097,7 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim, index_tracer: Inde else: raise NotImplementedError() cur_node_list = next_node_list - + inputs_dim = [] remove_inputs = [] for input_node in inputs: @@ -1071,7 +1107,7 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim, index_tracer: Inde continue user_idx = _find_idx_by_name(user.name, self.node_list) if start_idx <= user_idx <= end_idx: - chunk_dim = all_node_info[user]['chunk_dim'] + chunk_dim = all_node_info[user]["chunk_dim"] if chunk_dim is not None: input_dict[user_idx] = chunk_dim if len(input_dict) == 0: @@ -1081,7 +1117,7 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim, index_tracer: Inde for i in remove_inputs: if i in inputs: inputs.remove(i) - + chunk_info = { "region": (start_idx, end_idx), "inputs": inputs, @@ -1091,7 +1127,7 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim, index_tracer: Inde "outputs_dim": end_dim, "args": {}, } - + # we need to log input nodes to avoid deleteing them in the loop non_chunk_inputs = _find_chunk_all_input_nodes( self.node_list[start_idx : end_idx + 1] @@ -1129,7 +1165,7 @@ def _get_output_node_size(self, n): def _add_active_node(self, n, active_list): new_active = self._get_output_node(n)[1] - if n.op == 'placeholder': + if n.op == "placeholder": new_active.append(n.name) for i in new_active: if i not in active_list: @@ -1168,12 +1204,16 @@ def _remove_deactive_node(self, user, user_to_last_uses, active_list): for i in 
delete_node: if i in active_list: active_list.remove(i) - - def _get_chunk_inputs_size(self, chunk_inputs, chunk_inputs_non_chunk, node_list, chunk_end_idx): + + def _get_chunk_inputs_size( + self, chunk_inputs, chunk_inputs_non_chunk, node_list, chunk_end_idx + ): nodes_to_delete = [] for chunk_input in chunk_inputs + chunk_inputs_non_chunk: chunk_input_users = chunk_input.users.keys() - chunk_input_users_idx = [_find_idx_by_name(i.name, node_list) for i in chunk_input_users] + chunk_input_users_idx = [ + _find_idx_by_name(i.name, node_list) for i in chunk_input_users + ] if all(i <= chunk_end_idx for i in chunk_input_users_idx): if chunk_input not in nodes_to_delete: nodes_to_delete.append(chunk_input) @@ -1226,7 +1266,9 @@ def _get_chunk_ratio(self, node, chunk_inputs, chunk_inputs_dim, chunk_size): for (input_node, input_node_dim) in zip(chunk_inputs, chunk_inputs_dim): for k, v in input_node_dim.items(): # TODO: inherit dim should be list too, int now - inherit_dim = self.index_tracer._find_inherit_dim(input_node, v, self.index_tracer.nodes_list[k]) + inherit_dim = self.index_tracer._find_inherit_dim( + input_node, v, self.index_tracer.nodes_list[k] + ) if k == _find_idx_by_name(node.name, self.index_tracer.nodes_list): chunk_ratio = float(chunk_size) / node_shape[inherit_dim] return chunk_ratio @@ -1234,7 +1276,7 @@ def _get_chunk_ratio(self, node, chunk_inputs, chunk_inputs_dim, chunk_size): if k in source and inherit_dim in source[k]: chunk_ratio = float(chunk_size) / node_shape[dim] return chunk_ratio - return 1. 
+ return 1.0 def _get_chunk_delete_node_size( self, user, user_to_last_uses, chunk_ratio, chunk_inputs_names @@ -1295,7 +1337,7 @@ def estimate_chunk_inference_mem( chunk_ratio = 1 # use it to estimate chunk mem chunk_size = 1 chunk_inputs_names = [] - + if use_chunk: chunk_regions = [i["region"] for i in chunk_infos] chunk_starts = [i[0] for i in chunk_regions] @@ -1313,12 +1355,17 @@ def estimate_chunk_inference_mem( if use_chunk and idx in chunk_starts: chunk_within = True chunk_region_idx = chunk_starts.index(idx) - act_memory += self._get_output_node_size(chunk_outputs[chunk_region_idx]) / (1024**2) + act_memory += self._get_output_node_size( + chunk_outputs[chunk_region_idx] + ) / (1024**2) # determine chunk ratio for current node if chunk_within: chunk_ratio = self._get_chunk_ratio( - node, chunk_inputs[chunk_region_idx], chunk_inputs_dim[chunk_region_idx], chunk_size + node, + chunk_inputs[chunk_region_idx], + chunk_inputs_dim[chunk_region_idx], + chunk_size, ) # if node is placeholder, just add the size of the node @@ -1353,18 +1400,18 @@ def estimate_chunk_inference_mem( / (1024**2) ) # delete unused vars not in chunk_input_list - # we can't delete input nodes until chunk ends + # we can't delete input nodes until chunk ends if chunk_within: act_memory -= self._get_chunk_delete_node_size( node, user_to_last_uses_no_free_var, chunk_ratio, - chunk_inputs_names + chunk_inputs_names, ) / (1024**2) else: - act_memory -= (self._get_delete_node_size( + act_memory -= self._get_delete_node_size( node, user_to_last_uses_no_free_var, chunk_inputs_names - ) / (1024**2)) + ) / (1024**2) # log active node, only effective without chunk self._add_active_node(node, active_node_list) @@ -1376,11 +1423,11 @@ def estimate_chunk_inference_mem( self._get_output_node_size(node) * chunk_ratio / (1024**2) ) act_memory -= self._get_chunk_inputs_size( - chunk_inputs[chunk_region_idx], - chunk_inputs_non_chunk[chunk_region_idx], + chunk_inputs[chunk_region_idx], + 
chunk_inputs_non_chunk[chunk_region_idx], node_list, - chunk_regions[chunk_region_idx][1] - ) / (1024**2) + chunk_regions[chunk_region_idx][1], + ) / (1024**2) chunk_within = False chunk_ratio = 1 chunk_region_idx = None @@ -1436,7 +1483,7 @@ def _search_max_chunk_region(self, active_node, peak_node, chunk_regions): active_node_num = [len(i) for i in active_node] min_active_node_num = min(active_node_num[free_var_num:]) threshold = max(free_var_num, min_active_node_num) - + # from peak_node to free_var inside_flag = False chunk_region_start = free_var_num @@ -1494,7 +1541,12 @@ def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx): continue for start_node, start_trace in start_traces.items(): for start_dim, start_trace_idx in enumerate(start_trace["idx"]): - if start_idx == 199 and end_idx == 229 and start_dim == 2 and end_dim == 2: + if ( + start_idx == 199 + and end_idx == 229 + and start_dim == 2 + and end_dim == 2 + ): print(1) self.flow_tracer.flow_search( start_idx, start_dim, end_idx, end_dim, self.index_tracer @@ -1576,7 +1628,7 @@ def _search_best_chunk_region(self, possible_chunk_regions, chunk_infos): max_region_range = 0 best_region = None return best_region - + def _is_legal_region(self, cur_chunk_info, chunk_infos): (chunk_region_start, chunk_region_end) = cur_chunk_info["region"] if cur_chunk_info in chunk_infos: @@ -1585,11 +1637,13 @@ def _is_legal_region(self, cur_chunk_info, chunk_infos): return False for i in chunk_infos: region = i["region"] - if not ((chunk_region_start > region[1] and chunk_region_end > region[1]) - or (chunk_region_start < region[0] and chunk_region_end < region[0])): + if not ( + (chunk_region_start > region[1] and chunk_region_end > region[1]) + or (chunk_region_start < region[0] and chunk_region_end < region[0]) + ): return False return True - + def _step_search(self, mem_peak, active_node, chunk_regions): peak_node = self._find_peak_node(mem_peak) max_chunk_region = self._search_max_chunk_region( @@ 
-1600,7 +1654,9 @@ def _step_search(self, mem_peak, active_node, chunk_regions): possible_chunk_regions = self._search_possible_chunk_regions( max_chunk_region, peak_node ) - best_chunk_region = self._search_best_chunk_region(possible_chunk_regions, chunk_regions) + best_chunk_region = self._search_best_chunk_region( + possible_chunk_regions, chunk_regions + ) return best_chunk_region def _stop_search(self, init_mem_peak, mem_peak): @@ -1667,7 +1723,11 @@ def _gen_loop_end( chunk_slice = _gen_chunk_slice_dim( chunk_outputs_dim, "chunk_idx", chunk_output_shape ) - context = " chunk_result%s = %s; %s = None\n" % (chunk_slice, chunk_outputs_name, chunk_outputs_name) + context = " chunk_result%s = %s; %s = None\n" % ( + chunk_slice, + chunk_outputs_name, + chunk_outputs_name, + ) context += ( chunk_outputs_name + " = chunk_result; chunk_result = None; chunk_size = None" ) From 774d34f1aa2f9534557dd4a0ca866392a496e448 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Fri, 23 Dec 2022 13:41:10 +0800 Subject: [PATCH 046/209] refactor flow search --- chunk_codegen.py | 78 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 58 insertions(+), 20 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index eb16361c04fc..0b0a164fe999 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -1004,7 +1004,7 @@ def _assgin_single_node_flow( # if already in node_info, arg dim must be same if arg_node in all_node_info: - if all_node_info[arg_node] != arg_dim: + if all_node_info[arg_node]['chunk_dim'] != arg_dim: return False all_node_info[arg_node]["fix_dim"] = list( set(all_node_info[arg_node]["fix_dim"] + arg_fix_dim) @@ -1128,14 +1128,68 @@ def flow_search( "args": {}, } + # move useless nodes ahead of loop + # get all possible prepose nodes + maybe_prepose_nodes = [] + for node, node_info in all_node_info.items(): + if node_info['chunk_dim'] is None: + maybe_prepose_nodes.append(node) + maybe_prepose_nodes.sort(key=lambda x: _find_idx_by_name(x.name, 
index_tracer.nodes_list), reverse=True) # from last node to first node + prepose_nodes = [] + # set every node as root, search its args, if all legal, turn root and args as prepose nodes + while len(maybe_prepose_nodes) > 0: + tmp_cur_prepose_nodes = [maybe_prepose_nodes[0]] + tmp_cur_related_prepose_nodes = [] + prepose_flag = True + + # loop cur node's all arg until out of chunk + while len(tmp_cur_prepose_nodes) > 0: + tmp_next_prepose_nodes = [] + tmp_cur_related_prepose_nodes.extend(tmp_cur_prepose_nodes) + for cur_prepose_node in tmp_cur_prepose_nodes: + for cur_prepose_node_arg in cur_prepose_node.args: + if type(cur_prepose_node_arg) != type(cur_prepose_node): + continue + # out of loop + if not (start_idx <= _find_idx_by_name(cur_prepose_node_arg.name, self.node_list) < end_idx): + continue + # compute op in loop + elif cur_prepose_node_arg in all_node_info: + if all_node_info[cur_prepose_node_arg]['chunk_dim'] is None: + tmp_next_prepose_nodes.append(cur_prepose_node_arg) + else: + prepose_flag = False + break; break; break + # non compute op + else: + tmp_next_prepose_nodes.append(cur_prepose_node_arg) + tmp_cur_prepose_nodes = tmp_next_prepose_nodes + + if prepose_flag == False: + maybe_prepose_nodes.remove(maybe_prepose_nodes[0]) + continue + else: + for n in tmp_cur_related_prepose_nodes: + if n not in prepose_nodes: + prepose_nodes.append(n) + if n in maybe_prepose_nodes: + maybe_prepose_nodes.remove(n) + # sort by index + prepose_nodes.sort(key=lambda x: _find_idx_by_name(x.name, index_tracer.nodes_list)) + chunk_info["args"]["prepose_nodes"] = prepose_nodes + # we need to log input nodes to avoid deleteing them in the loop + chunk_node_list = self.node_list[start_idx : end_idx + 1] + # also need to get some prepose node's arg out of non_chunk_inputs + for n in prepose_nodes: + chunk_node_list.remove(n) non_chunk_inputs = _find_chunk_all_input_nodes( - self.node_list[start_idx : end_idx + 1] + chunk_node_list ) for i in non_chunk_inputs: - if i not 
in chunk_info["inputs"]: + if i not in chunk_info["inputs"] and i not in prepose_nodes: chunk_info["inputs_non_chunk"].append(i) - + return chunk_info @@ -1541,16 +1595,6 @@ def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx): continue for start_node, start_trace in start_traces.items(): for start_dim, start_trace_idx in enumerate(start_trace["idx"]): - if ( - start_idx == 199 - and end_idx == 229 - and start_dim == 2 - and end_dim == 2 - ): - print(1) - self.flow_tracer.flow_search( - start_idx, start_dim, end_idx, end_dim, self.index_tracer - ) # dim size cannot be 1 if ( _get_node_shape(end_node)[end_dim] == 1 @@ -1567,12 +1611,6 @@ def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx): start_idx, end_dim, end_node, end_idx ): continue - # detect flow meet - # flow_block, chunk_info = self.flow_tracer._detect_flow( - # start_idx, start_dim, end_idx, end_dim, self.index_tracer - # ) - # if flow_block: - # continue # flow search chunk_info = self.flow_tracer.flow_search( start_idx, start_dim, end_idx, end_dim, self.index_tracer From 522f01741864f3565f8e97837ecc7289774ee127 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Fri, 23 Dec 2022 13:41:51 +0800 Subject: [PATCH 047/209] code style --- chunk_codegen.py | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 0b0a164fe999..a8b970116d1d 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -1004,7 +1004,7 @@ def _assgin_single_node_flow( # if already in node_info, arg dim must be same if arg_node in all_node_info: - if all_node_info[arg_node]['chunk_dim'] != arg_dim: + if all_node_info[arg_node]["chunk_dim"] != arg_dim: return False all_node_info[arg_node]["fix_dim"] = list( set(all_node_info[arg_node]["fix_dim"] + arg_fix_dim) @@ -1132,16 +1132,19 @@ def flow_search( # get all possible prepose nodes maybe_prepose_nodes = [] for node, node_info in all_node_info.items(): - if 
node_info['chunk_dim'] is None: + if node_info["chunk_dim"] is None: maybe_prepose_nodes.append(node) - maybe_prepose_nodes.sort(key=lambda x: _find_idx_by_name(x.name, index_tracer.nodes_list), reverse=True) # from last node to first node + maybe_prepose_nodes.sort( + key=lambda x: _find_idx_by_name(x.name, index_tracer.nodes_list), + reverse=True, + ) # from last node to first node prepose_nodes = [] # set every node as root, search its args, if all legal, turn root and args as prepose nodes while len(maybe_prepose_nodes) > 0: tmp_cur_prepose_nodes = [maybe_prepose_nodes[0]] tmp_cur_related_prepose_nodes = [] prepose_flag = True - + # loop cur node's all arg until out of chunk while len(tmp_cur_prepose_nodes) > 0: tmp_next_prepose_nodes = [] @@ -1151,20 +1154,28 @@ def flow_search( if type(cur_prepose_node_arg) != type(cur_prepose_node): continue # out of loop - if not (start_idx <= _find_idx_by_name(cur_prepose_node_arg.name, self.node_list) < end_idx): + if not ( + start_idx + <= _find_idx_by_name( + cur_prepose_node_arg.name, self.node_list + ) + < end_idx + ): continue # compute op in loop elif cur_prepose_node_arg in all_node_info: - if all_node_info[cur_prepose_node_arg]['chunk_dim'] is None: + if all_node_info[cur_prepose_node_arg]["chunk_dim"] is None: tmp_next_prepose_nodes.append(cur_prepose_node_arg) else: prepose_flag = False - break; break; break + break + break + break # non compute op else: tmp_next_prepose_nodes.append(cur_prepose_node_arg) tmp_cur_prepose_nodes = tmp_next_prepose_nodes - + if prepose_flag == False: maybe_prepose_nodes.remove(maybe_prepose_nodes[0]) continue @@ -1175,21 +1186,21 @@ def flow_search( if n in maybe_prepose_nodes: maybe_prepose_nodes.remove(n) # sort by index - prepose_nodes.sort(key=lambda x: _find_idx_by_name(x.name, index_tracer.nodes_list)) + prepose_nodes.sort( + key=lambda x: _find_idx_by_name(x.name, index_tracer.nodes_list) + ) chunk_info["args"]["prepose_nodes"] = prepose_nodes - + # we need to log input 
nodes to avoid deleteing them in the loop chunk_node_list = self.node_list[start_idx : end_idx + 1] # also need to get some prepose node's arg out of non_chunk_inputs for n in prepose_nodes: chunk_node_list.remove(n) - non_chunk_inputs = _find_chunk_all_input_nodes( - chunk_node_list - ) + non_chunk_inputs = _find_chunk_all_input_nodes(chunk_node_list) for i in non_chunk_inputs: if i not in chunk_info["inputs"] and i not in prepose_nodes: chunk_info["inputs_non_chunk"].append(i) - + return chunk_info From d309e9338bde716ca356af8a27e0c484e97abbd9 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Fri, 23 Dec 2022 14:26:12 +0800 Subject: [PATCH 048/209] adapt codegen to prepose node --- chunk_codegen.py | 43 +++++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index a8b970116d1d..e3a7643d7499 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -1198,7 +1198,7 @@ def flow_search( chunk_node_list.remove(n) non_chunk_inputs = _find_chunk_all_input_nodes(chunk_node_list) for i in non_chunk_inputs: - if i not in chunk_info["inputs"] and i not in prepose_nodes: + if i not in chunk_info["inputs"]: chunk_info["inputs_non_chunk"].append(i) return chunk_info @@ -1425,6 +1425,7 @@ def estimate_chunk_inference_mem( ) / (1024**2) # determine chunk ratio for current node + # TODO: adapt to prepose node memory if chunk_within: chunk_ratio = self._get_chunk_ratio( node, @@ -1602,7 +1603,6 @@ def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx): chunk_infos = [] for end_dim, end_trace_idx in enumerate(end_trace["idx"]): if len(start_traces) > 1: - # TODO: implement multi input chunk continue for start_node, start_trace in start_traces.items(): for start_dim, start_trace_idx in enumerate(start_trace["idx"]): @@ -1831,7 +1831,6 @@ def _find_chunk_compute_input_and_output_nodes(nodes: List[Node]): # if a node has a user node which is not in the node list # we treat that user node as 
the node receiving the current node output - # TODO: it is unsafe to remove non compute node here for node in nodes: for output_node in node.users.keys(): if ( @@ -1900,6 +1899,8 @@ def emit_code_with_chunk( chunk_outputs = [i["outputs"][0] for i in chunk_search] chunk_outputs_dim = [i["outputs_dim"] for i in chunk_search] + + chunk_prepose_nodes = [i["args"]["prepose_nodes"] for i in chunk_search] node_idx = 0 region_idx = 0 @@ -1911,7 +1912,11 @@ def emit_code_with_chunk( if node_idx in chunk_starts: within_chunk_region = True region_idx = chunk_starts.index(node_idx) - + # add prepose nodes + for i in chunk_prepose_nodes[region_idx]: + prepose_node = node_list[_find_idx_by_name(i.name, node_list)] + emit_node_func(prepose_node, body) + delete_unused_value_func(prepose_node, body, chunk_inputs_names) # add for loop body.append( _gen_loop_start( @@ -1922,20 +1927,22 @@ def emit_code_with_chunk( ) if within_chunk_region: - emit_node_func(node, body) - # replace input var with chunk var - for input_node_idx, input_node in enumerate(chunk_inputs[region_idx]): - for idx, dim in chunk_inputs_dim[region_idx][input_node_idx].items(): - if idx == node_idx: - chunk_slice = _gen_chunk_slice_dim( - dim, "chunk_idx", _get_node_shape(input_node) - ) - body[-1] = _replace_name( - body[-1], input_node.name, input_node.name + chunk_slice - ) - body[-1] = " " + body[-1] - delete_unused_value_func(node, body, chunk_inputs_names) - + if any(node.name == i.name for i in chunk_prepose_nodes[region_idx]): + pass + else: + emit_node_func(node, body) + # replace input var with chunk var + for input_node_idx, input_node in enumerate(chunk_inputs[region_idx]): + for idx, dim in chunk_inputs_dim[region_idx][input_node_idx].items(): + if idx == node_idx: + chunk_slice = _gen_chunk_slice_dim( + dim, "chunk_idx", _get_node_shape(input_node) + ) + body[-1] = _replace_name( + body[-1], input_node.name, input_node.name + chunk_slice + ) + body[-1] = " " + body[-1] + delete_unused_value_func(node, 
body, chunk_inputs_names) else: emit_node_func(node, body) if node_idx not in chunk_inputs: From 49ba619085c33eef372e73b6a45aecdc3d37937f Mon Sep 17 00:00:00 2001 From: oahzxl Date: Fri, 23 Dec 2022 14:26:43 +0800 Subject: [PATCH 049/209] code style --- chunk_codegen.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index e3a7643d7499..40196285ec8c 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -1899,7 +1899,7 @@ def emit_code_with_chunk( chunk_outputs = [i["outputs"][0] for i in chunk_search] chunk_outputs_dim = [i["outputs_dim"] for i in chunk_search] - + chunk_prepose_nodes = [i["args"]["prepose_nodes"] for i in chunk_search] node_idx = 0 @@ -1933,7 +1933,9 @@ def emit_code_with_chunk( emit_node_func(node, body) # replace input var with chunk var for input_node_idx, input_node in enumerate(chunk_inputs[region_idx]): - for idx, dim in chunk_inputs_dim[region_idx][input_node_idx].items(): + for idx, dim in chunk_inputs_dim[region_idx][ + input_node_idx + ].items(): if idx == node_idx: chunk_slice = _gen_chunk_slice_dim( dim, "chunk_idx", _get_node_shape(input_node) From 4d89525fc2f828c9c65bf4077b677db9a78c8466 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Fri, 23 Dec 2022 14:28:49 +0800 Subject: [PATCH 050/209] remove abandoned function --- chunk_codegen.py | 106 ----------------------------------------------- 1 file changed, 106 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 40196285ec8c..e2786d5e244f 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -857,112 +857,6 @@ def trace_flow(self): ) return self.flow_trace - def _detect_flow( - self, start_idx, start_dim, end_idx, end_dim, index_tracer: IndexTracer - ): - inputs, outputs = _find_chunk_compute_input_and_output_nodes( - self.node_list[start_idx : end_idx + 1] - ) - chunk_info = { - "region": (start_idx, end_idx), - "inputs": inputs, - "inputs_non_chunk": [], - "inputs_dim": start_dim, - "outputs": outputs, - 
"outputs_dim": end_dim, - "args": {}, - } - flow_block = False - - # TODO don't allow multi outputs now - if len(outputs) > 1: - flow_block = True - return flow_block, chunk_info - - # for idx in range(start_idx, end_idx + 1): - # node = self.node_list[idx] - # mix_flow_node = self._get_flow_mix_node(node) - # if mix_flow_node is None: - # continue - - # # if there is a flow mix, op must be in [mul, add, matmul] - # # element-wise op requires dim to be equal in every dim - # if any(n in node.name for n in ["mul", "add"]): - # for i in node.args: - # if type(i) == type(mix_flow_node) and i != mix_flow_node: - # main_flow_var = i - # # if mix flow is a broadcast in chunk dim, - # # TODO: need to move that flow out of the chunk - # mix_flow_node_dim = index_tracer.get_node_chunk_dim( - # self.node_list[end_idx], end_dim, node - # ) - # # TODO: we need to loop every dim - # if isinstance(mix_flow_node_dim, list): - # mix_flow_node_dim = mix_flow_node_dim[0] - # if mix_flow_node_dim is None: - # flow_block = True - # break - # if _get_node_shape(mix_flow_node)[mix_flow_node_dim] == 1: - # flow_block = False - # for i in self._get_same_flow_node( - # chunk_info["inputs"], mix_flow_node - # ): - # chunk_info["inputs"].remove(i) - # # else, we need to chunk mix var as well - # else: - # # TODO chunk another value - # flow_block = True - # break - # else: - # raise NotImplementedError("%s not implemented" % node.name) - # if flow_block: - # flow_block = True - # return flow_block, chunk_info - - inputs_dim = [] - remove_inputs = [] - for input_node in chunk_info["inputs"]: - input_dict = {} - for user in input_node.users.keys(): - if _is_non_compute_node(user): - continue - user_idx = _find_idx_by_name(user.name, self.node_list) - dim = None - if start_dim <= user_idx < end_idx: - dim = index_tracer.get_node_chunk_dim( - self.node_list[end_idx], end_dim, input_node - ) - # TODO: we need to loop every dim - if isinstance(dim, list): - dim = dim[0] - elif user_idx == end_idx: 
- dim = end_dim - # n has relation with chunk dim - if dim is not None and _get_node_shape(user)[dim] != 1: - input_dict[user_idx] = dim - if len(input_dict) == 0: - remove_inputs.append(input_node) - else: - inputs_dim.append(input_dict) - chunk_info["inputs_dim"] = inputs_dim - for i in remove_inputs: - if i in chunk_info["inputs"]: - chunk_info["inputs"].remove(i) - - duplicate_result, duplicate_dim = index_tracer.check_index_duplicate( - chunk_info, return_dim=True - ) - - # we need to log input nodes to avoid deleteing them in the loop - non_chunk_inputs = _find_chunk_all_input_nodes( - self.node_list[start_idx : end_idx + 1] - ) - for i in non_chunk_inputs: - if i not in chunk_info["inputs"]: - chunk_info["inputs_non_chunk"].append(i) - - return flow_block, chunk_info - def _assgin_single_node_flow( self, arg_node, From 4f5e105af30fccb4b0595edd341bdd7a4b226aa9 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Fri, 23 Dec 2022 15:34:41 +0800 Subject: [PATCH 051/209] remove flow tracer --- chunk_codegen.py | 171 ++++++++--------------------------------------- 1 file changed, 27 insertions(+), 144 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index e2786d5e244f..838f53949de7 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -67,7 +67,7 @@ def _is_non_compute_node_except_placeholder_output(node): class IndexTracer(object): def __init__(self, gm) -> None: self.gm = gm - self.nodes_list = list(gm.graph.nodes) + self.node_list = list(gm.graph.nodes) self.idx_trace_list = self._init_idx_trace_list() self.idx_trace_equal = [] self.idx_view_list = [] @@ -75,7 +75,7 @@ def __init__(self, gm) -> None: def _init_idx_trace_list(self): idx_trace_list = [] - for n in self.nodes_list: + for n in self.node_list: if _get_node_shape(n) != None: cur_trace = { "idx": [None for _ in range(len(_get_node_shape(n)))], @@ -136,7 +136,7 @@ def _add_source(self, node_from, node_from_dim, node_to, node_to_dim, init=False node_from_trace = 
self._find_trace_from_node(node_from) node_to_dim = self._transform_index(node_to, node_to_dim) node_to_trace = self._find_trace_from_node(node_to) - node_from_idx = _find_idx_by_name(node_from.name, self.nodes_list) + node_from_idx = _find_idx_by_name(node_from.name, self.node_list) if init: node_to_trace["source"][node_to_dim] = {} # add dim to cur new source @@ -210,7 +210,7 @@ def _find_trace_from_node(self, node): idx (list): idx of the node compute (list): computed idx of the node. """ - node_idx = _find_idx_by_name(node.name, self.nodes_list) + node_idx = _find_idx_by_name(node.name, self.node_list) node_dict = self.idx_trace_list[node_idx] return node_dict @@ -224,7 +224,7 @@ def _find_source_trace_from_node(self, node): idx (list): idx of the node compute (list): computed idx of the node. """ - node_idx = _find_idx_by_name(node.name, self.nodes_list) + node_idx = _find_idx_by_name(node.name, self.node_list) node_dict = self.idx_trace_list[node_idx] return node_dict["source"] @@ -237,7 +237,7 @@ def _find_idx_trace_from_node(self, node): Returns: idx (list): idx of the node """ - node_idx = _find_idx_by_name(node.name, self.nodes_list) + node_idx = _find_idx_by_name(node.name, self.node_list) return self.idx_trace_list[node_idx]["idx"] def _find_compute_trace_from_node(self, node): @@ -249,7 +249,7 @@ def _find_compute_trace_from_node(self, node): Returns: compute (list): computed idx of the node. 
""" - node_idx = _find_idx_by_name(node.name, self.nodes_list) + node_idx = _find_idx_by_name(node.name, self.node_list) return self.idx_trace_list[node_idx]["compute"] def _assign_index_as_input(self, node, node_idx, input_node=None): @@ -262,7 +262,7 @@ def _assign_index_as_input(self, node, node_idx, input_node=None): """ if input_node == None: input_node = node.args[0] - input_node_idx = _find_idx_by_name(input_node.name, self.nodes_list) + input_node_idx = _find_idx_by_name(input_node.name, self.node_list) input_node_idx_trace = self.idx_trace_list[input_node_idx]["idx"] new_idx_trace = copy.deepcopy(input_node_idx_trace) @@ -591,7 +591,7 @@ def _merge_equal_idx(self): ] def trace_index(self): - for idx, node in enumerate(self.nodes_list): + for idx, node in enumerate(self.node_list): if node.op == "placeholder": self._assign_all_index(node, idx) elif node.op == "call_method": @@ -655,7 +655,7 @@ def check_index_source(self, start_dim, start_node, start_idx, end_dim, end_node Returns: bool: True if check pass """ - start_node_idx = _find_idx_by_name(start_node.name, self.nodes_list) + start_node_idx = _find_idx_by_name(start_node.name, self.node_list) end_node_trace = self._find_trace_from_node(end_node) end_node_trace_source = end_node_trace["source"][end_dim] sorted_source = sorted( @@ -690,14 +690,14 @@ def check_index_compute(self, start_idx, end_dim, end_node, end_idx): def get_node_chunk_dim(self, node_from, node_from_dim, node_to): node_from_source = self._find_source_trace_from_node(node_from) dim_source = node_from_source[node_from_dim] - node_to_idx = _find_idx_by_name(node_to.name, self.nodes_list) + node_to_idx = _find_idx_by_name(node_to.name, self.node_list) for k, v in dim_source.items(): if k == node_to_idx: return v return None def _find_inherit_dim(self, input_node, input_dim, node): - input_node_idx = _find_idx_by_name(input_node.name, self.nodes_list) + input_node_idx = _find_idx_by_name(input_node.name, self.node_list) node_trace_source = 
self._find_source_trace_from_node(node) for node_dim in range(len(_get_node_shape(node))): if ( @@ -711,11 +711,11 @@ def check_index_duplicate(self, chunk_infos, return_dim=False): input_dim_after_node = {} for input_node_idx, input_node in enumerate(chunk_infos["inputs"]): for k, v in chunk_infos["inputs_dim"][input_node_idx].items(): - inherit_dim = self._find_inherit_dim(input_node, v, self.nodes_list[k]) + inherit_dim = self._find_inherit_dim(input_node, v, self.node_list[k]) if inherit_dim: input_dim_after_node[k] = inherit_dim - for node in self.nodes_list[ + for node in self.node_list[ chunk_infos["region"][0] : chunk_infos["region"][1] + 1 ]: if _is_non_compute_node_except_placeholder(node): @@ -746,124 +746,11 @@ def check_index_duplicate(self, chunk_infos, return_dim=False): else: return True - -class FlowTracer(object): - def __init__(self, gm) -> None: - self.gm = gm - self.node_list = list(gm.graph.nodes) - self.flow_trace = {} - - def _add_trace(self, name): - self.flow_trace[name] = [] - - def _add_node(self, trace_name, node): - self.flow_trace[trace_name].append( - {"node": node, "inside_depend": [], "outside_depend": []} - ) - - def _add_inside_depend(self, flow_name, node, inside_depend_node): - for i in self.flow_trace[flow_name]: - if i["node"] == node: - i["inside_depend"].append(inside_depend_node) - return - raise RuntimeError("node not found") - - def _add_outside_depend( - self, flow_name, node, outside_depend_node, outside_depend_trace - ): - for i in self.flow_trace[flow_name]: - if i["node"] == node: - i["outside_depend"].append({outside_depend_trace: outside_depend_node}) - return - raise RuntimeError("node not found") - - def _init_trace(self): - for i in self.node_list: - if i.op == "placeholder": - self._add_trace(i.name) - self._add_node(i.name, i) - - def _find_flow_for_node(self, node): - if type(self.node_list[0]) != type(node): - return None - if _is_non_compute_node_except_placeholder(node): - return None - for name, trace in 
self.flow_trace.items(): - for i in trace: - if node == i["node"]: - return name - if any(i in node.name for i in ["ones_like"]): - self._add_trace(node.name) - self._add_node(node.name, node) - return node.name - raise RuntimeError("node not found") - - def _find_first_valid_flow(self, flow): - for i in flow: - if i is not None: - return i - raise RuntimeError("invalid flow") - - def find_node_flow(self, node): - for name, trace in self.flow_trace.items(): - for i in trace: - if node == i["node"]: - return name, i - raise RuntimeError("invalid node") - - def _get_flow_mix_node(self, node): - if _is_non_compute_node(node): - return None - _, node_trace = self.find_node_flow(node) - if len(node_trace["outside_depend"]) == 0: - return None - elif len(node_trace["outside_depend"]) > 1: - raise NotImplementedError - vars = list(node_trace["outside_depend"][0].values())[0] - return vars - - def _get_same_flow_node(self, node_list, node): - name, _ = self.find_node_flow(node) - result = [] - for i in self.flow_trace[name]: - if i["node"] in node_list: - result.append(i["node"]) - return result - - def trace_flow(self): - # init trace - self._init_trace() - - for node in self.node_list: - # skip if non compute node - if all( - type(arg) != type(node) or _is_non_compute_node_except_placeholder(arg) - for arg in node.args - ) or _is_non_compute_node(node): - continue - - node_input_flows = [self._find_flow_for_node(arg) for arg in node.args] - - node_domin_flow = self._find_first_valid_flow(node_input_flows) - self._add_node(node_domin_flow, node) - for node_input_flow, arg in zip(node_input_flows, node.args): - if node_input_flow is None: - continue - elif node_input_flow == node_domin_flow: - self._add_inside_depend(node_domin_flow, node, arg) - else: - self._add_outside_depend( - node_domin_flow, node, arg, node_input_flow - ) - return self.flow_trace - def _assgin_single_node_flow( self, arg_node, start_idx, end_idx, - inputs, - index_tracer, cur_node_dim, 
cur_node_compute, cur_node_source, @@ -871,7 +758,7 @@ def _assgin_single_node_flow( all_node_info, next_node_list, ): - arg_idx = _find_idx_by_name(arg_node.name, index_tracer.nodes_list) + arg_idx = _find_idx_by_name(arg_node.name, self.node_list) # arg in chunk range or be inputs if not (start_idx <= arg_idx < end_idx): return True @@ -911,7 +798,7 @@ def _assgin_single_node_flow( return True def flow_search( - self, start_idx, start_dim, end_idx, end_dim, index_tracer: IndexTracer + self, start_idx, start_dim, end_idx, end_dim ): inputs, outputs = _find_chunk_compute_input_and_output_nodes( self.node_list[start_idx : end_idx + 1] @@ -920,7 +807,7 @@ def flow_search( if len(outputs) > 1: return None - cur_node_list = [index_tracer.nodes_list[end_idx]] # start from the last node + cur_node_list = [self.node_list[end_idx]] # start from the last node all_node_info = {cur_node_list[0]: {"chunk_dim": end_dim, "fix_dim": []}} while len(cur_node_list) > 0: @@ -930,12 +817,12 @@ def flow_search( # get cur node info cur_node_chunk_dim = all_node_info[cur_node]["chunk_dim"] cur_node_fix_dim = all_node_info[cur_node]["fix_dim"] - cur_node_idx = _find_idx_by_name(cur_node.name, index_tracer.nodes_list) + cur_node_idx = _find_idx_by_name(cur_node.name, self.node_list) if cur_node_chunk_dim: - cur_node_compute = index_tracer._find_compute_trace_from_node( + cur_node_compute = self._find_compute_trace_from_node( cur_node ) - cur_node_source = index_tracer._find_source_trace_from_node( + cur_node_source = self._find_source_trace_from_node( cur_node ) else: @@ -953,8 +840,6 @@ def flow_search( arg, start_idx, end_idx, - inputs, - index_tracer, cur_node_chunk_dim, cur_node_compute, cur_node_source, @@ -970,7 +855,7 @@ def flow_search( for arg in arg_list: if not ( start_idx - <= _find_idx_by_name(arg.name, index_tracer.nodes_list) + <= _find_idx_by_name(arg.name, self.node_list) < end_idx ): continue @@ -1029,7 +914,7 @@ def flow_search( if node_info["chunk_dim"] is None: 
maybe_prepose_nodes.append(node) maybe_prepose_nodes.sort( - key=lambda x: _find_idx_by_name(x.name, index_tracer.nodes_list), + key=lambda x: _find_idx_by_name(x.name, self.node_list), reverse=True, ) # from last node to first node prepose_nodes = [] @@ -1081,7 +966,7 @@ def flow_search( maybe_prepose_nodes.remove(n) # sort by index prepose_nodes.sort( - key=lambda x: _find_idx_by_name(x.name, index_tracer.nodes_list) + key=lambda x: _find_idx_by_name(x.name, self.node_list) ) chunk_info["args"]["prepose_nodes"] = prepose_nodes @@ -1226,9 +1111,9 @@ def _get_chunk_ratio(self, node, chunk_inputs, chunk_inputs_dim, chunk_size): for k, v in input_node_dim.items(): # TODO: inherit dim should be list too, int now inherit_dim = self.index_tracer._find_inherit_dim( - input_node, v, self.index_tracer.nodes_list[k] + input_node, v, self.index_tracer.node_list[k] ) - if k == _find_idx_by_name(node.name, self.index_tracer.nodes_list): + if k == _find_idx_by_name(node.name, self.index_tracer.node_list): chunk_ratio = float(chunk_size) / node_shape[inherit_dim] return chunk_ratio for dim, source in enumerate(node_source): @@ -1412,8 +1297,6 @@ def __init__(self, gm) -> None: self.node_list = list(gm.graph.nodes) self.index_tracer = IndexTracer(gm) self.index_tracer.trace_index() - self.flow_tracer = FlowTracer(gm) - self.flow_tracer.trace_flow() self.memory_estimator = MemoryEstimator(self.index_tracer) def _find_peak_node(self, mem_peak): @@ -1517,8 +1400,8 @@ def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx): ): continue # flow search - chunk_info = self.flow_tracer.flow_search( - start_idx, start_dim, end_idx, end_dim, self.index_tracer + chunk_info = self.index_tracer.flow_search( + start_idx, start_dim, end_idx, end_dim ) if chunk_info is None: continue From fa5e6fbf96448ebff1dc682e749a3f73a5a9c2b5 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Fri, 23 Dec 2022 15:38:37 +0800 Subject: [PATCH 052/209] code style --- chunk_codegen.py | 25 
+++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 838f53949de7..e80b0fd9be77 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -65,9 +65,8 @@ def _is_non_compute_node_except_placeholder_output(node): class IndexTracer(object): - def __init__(self, gm) -> None: - self.gm = gm - self.node_list = list(gm.graph.nodes) + def __init__(self, node_list) -> None: + self.node_list = node_list self.idx_trace_list = self._init_idx_trace_list() self.idx_trace_equal = [] self.idx_view_list = [] @@ -797,9 +796,7 @@ def _assgin_single_node_flow( next_node_list.append(arg_node) return True - def flow_search( - self, start_idx, start_dim, end_idx, end_dim - ): + def flow_search(self, start_idx, start_dim, end_idx, end_dim): inputs, outputs = _find_chunk_compute_input_and_output_nodes( self.node_list[start_idx : end_idx + 1] ) @@ -819,12 +816,8 @@ def flow_search( cur_node_fix_dim = all_node_info[cur_node]["fix_dim"] cur_node_idx = _find_idx_by_name(cur_node.name, self.node_list) if cur_node_chunk_dim: - cur_node_compute = self._find_compute_trace_from_node( - cur_node - ) - cur_node_source = self._find_source_trace_from_node( - cur_node - ) + cur_node_compute = self._find_compute_trace_from_node(cur_node) + cur_node_source = self._find_source_trace_from_node(cur_node) else: cur_node_compute = cur_node_source = None @@ -965,9 +958,7 @@ def flow_search( if n in maybe_prepose_nodes: maybe_prepose_nodes.remove(n) # sort by index - prepose_nodes.sort( - key=lambda x: _find_idx_by_name(x.name, self.node_list) - ) + prepose_nodes.sort(key=lambda x: _find_idx_by_name(x.name, self.node_list)) chunk_info["args"]["prepose_nodes"] = prepose_nodes # we need to log input nodes to avoid deleteing them in the loop @@ -1295,7 +1286,9 @@ class ChunkRegionSearch(object): def __init__(self, gm) -> None: self.gm = gm self.node_list = list(gm.graph.nodes) - self.index_tracer = IndexTracer(gm) + self.index_tracer = 
IndexTracer( + self.node_list + ) # node list shared in index tracer self.index_tracer.trace_index() self.memory_estimator = MemoryEstimator(self.index_tracer) From e0ae68e736cb56015fd1316113d52affaaf27749 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Fri, 23 Dec 2022 15:49:04 +0800 Subject: [PATCH 053/209] code style --- chunk_codegen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index e80b0fd9be77..6e772aa8a56a 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -1497,8 +1497,8 @@ def search_region(self): chunk_info = self._step_search(mem_peak, active_node, chunk_infos) if chunk_info is None: break - chunk_infos.append(chunk_info) + ( mem_peak, _, From 884a228ea674b02998575776b0069b15de0b7a10 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Fri, 23 Dec 2022 17:06:07 +0800 Subject: [PATCH 054/209] reorder nodes --- chunk_codegen.py | 127 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 101 insertions(+), 26 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 6e772aa8a56a..4b3b04d93b91 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -71,6 +71,7 @@ def __init__(self, node_list) -> None: self.idx_trace_equal = [] self.idx_view_list = [] self.idx_count = -1 + self.all_reorder_map = {i: i for i in range(len(self.idx_trace_list))} def _init_idx_trace_list(self): idx_trace_list = [] @@ -973,6 +974,91 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim): return chunk_info + def _get_reorder_map(self, chunk_info): + reorder_map = {i: i for i in range(len(self.node_list))} + + chunk_region_start = chunk_info["region"][0] + chunk_region_end = chunk_info["region"][1] + chunk_prepose_nodes = chunk_info["args"]["prepose_nodes"] + chunk_prepose_nodes_idx = [ + _find_idx_by_name(i.name, self.node_list) for i in chunk_prepose_nodes + ] + # put prepose nodes ahead + for idx, n in enumerate(chunk_prepose_nodes): + n_idx = chunk_prepose_nodes_idx[idx] + reorder_map[n_idx] = 
chunk_region_start + idx + # put other nodes after prepose nodes + for n in self.node_list[chunk_region_start : chunk_region_end + 1]: + if n in chunk_prepose_nodes: + continue + n_idx = _find_idx_by_name(n.name, self.node_list) + pos = sum([n_idx < i for i in chunk_prepose_nodes_idx]) + reorder_map[n_idx] = n_idx + pos + + return reorder_map + + def _reorder_chunk_info(self, chunk_info, reorder_map): + # update chunk info + chunk_info["region"] = ( + chunk_info["region"][0] + len(chunk_info["args"]["prepose_nodes"]), + chunk_info["region"][1], + ) + for idx, input_dim in enumerate(chunk_info["inputs_dim"]): + new_input_dim = {} + for k, v in input_dim.items(): + new_input_dim[reorder_map[k]] = v + chunk_info["inputs_dim"][idx] = new_input_dim + return chunk_info + + def _update_all_reorder_map(self, reorder_map): + for origin_idx, map_idx in self.all_reorder_map.items(): + self.all_reorder_map[origin_idx] = reorder_map[map_idx] + + def _reorder_self_node_list(self, reorder_map): + new_node_list = [None for _ in range(len(self.node_list))] + for old_idx, new_idx in reorder_map.items(): + new_node_list[new_idx] = self.node_list[old_idx] + self.node_list = new_node_list + + def _reorder_idx_trace(self, reorder_map): + # reorder list + new_idx_trace_list = [None for _ in range(len(self.idx_trace_list))] + for old_idx, new_idx in reorder_map.items(): + new_idx_trace_list[new_idx] = self.idx_trace_list[old_idx] + self.idx_trace_list = new_idx_trace_list + # update compute + for idx_trace in self.idx_trace_list: + compute = idx_trace["compute"] + for dim_compute in compute: + for idx, i in enumerate(dim_compute): + dim_compute[idx] = reorder_map[i] + # update source + for idx_trace in self.idx_trace_list: + source = idx_trace["source"] + for dim_idx, dim_source in enumerate(source): + new_dim_source = {} + for k, v in dim_source.items(): + new_dim_source[reorder_map[k]] = v + source[dim_idx] = new_dim_source + + def reorder_all(self, chunk_info): + if chunk_info is None: 
+ return chunk_info + if len(chunk_info["args"]["prepose_nodes"]) == 0: + return chunk_info + reorder_map = self._get_reorder_map(chunk_info) + self._update_all_reorder_map(reorder_map) + self._reorder_idx_trace(reorder_map) + self._reorder_self_node_list(reorder_map) + chunk_info = self._reorder_chunk_info(chunk_info, reorder_map) + return chunk_info + + def reorder_node_list(self, node_list): + new_node_list = [None for _ in range(len(node_list))] + for old_idx, new_idx in self.all_reorder_map.items(): + new_node_list[new_idx] = node_list[old_idx] + return new_node_list + class MemoryEstimator(object): def __init__(self, index_tracer: IndexTracer) -> None: @@ -1476,6 +1562,7 @@ def _step_search(self, mem_peak, active_node, chunk_regions): best_chunk_region = self._search_best_chunk_region( possible_chunk_regions, chunk_regions ) + best_chunk_region = self.index_tracer.reorder_all(best_chunk_region) return best_chunk_region def _stop_search(self, init_mem_peak, mem_peak): @@ -1670,8 +1757,7 @@ def emit_code_with_chunk( chunk_outputs = [i["outputs"][0] for i in chunk_search] chunk_outputs_dim = [i["outputs_dim"] for i in chunk_search] - chunk_prepose_nodes = [i["args"]["prepose_nodes"] for i in chunk_search] - + node_list = chunk_region_search.index_tracer.reorder_node_list(node_list) node_idx = 0 region_idx = 0 within_chunk_region = False @@ -1682,12 +1768,6 @@ def emit_code_with_chunk( if node_idx in chunk_starts: within_chunk_region = True region_idx = chunk_starts.index(node_idx) - # add prepose nodes - for i in chunk_prepose_nodes[region_idx]: - prepose_node = node_list[_find_idx_by_name(i.name, node_list)] - emit_node_func(prepose_node, body) - delete_unused_value_func(prepose_node, body, chunk_inputs_names) - # add for loop body.append( _gen_loop_start( chunk_inputs[region_idx], @@ -1697,24 +1777,19 @@ def emit_code_with_chunk( ) if within_chunk_region: - if any(node.name == i.name for i in chunk_prepose_nodes[region_idx]): - pass - else: - 
emit_node_func(node, body) - # replace input var with chunk var - for input_node_idx, input_node in enumerate(chunk_inputs[region_idx]): - for idx, dim in chunk_inputs_dim[region_idx][ - input_node_idx - ].items(): - if idx == node_idx: - chunk_slice = _gen_chunk_slice_dim( - dim, "chunk_idx", _get_node_shape(input_node) - ) - body[-1] = _replace_name( - body[-1], input_node.name, input_node.name + chunk_slice - ) - body[-1] = " " + body[-1] - delete_unused_value_func(node, body, chunk_inputs_names) + emit_node_func(node, body) + # replace input var with chunk var + for input_node_idx, input_node in enumerate(chunk_inputs[region_idx]): + for idx, dim in chunk_inputs_dim[region_idx][input_node_idx].items(): + if idx == node_idx: + chunk_slice = _gen_chunk_slice_dim( + dim, "chunk_idx", _get_node_shape(input_node) + ) + body[-1] = _replace_name( + body[-1], input_node.name, input_node.name + chunk_slice + ) + body[-1] = " " + body[-1] + delete_unused_value_func(node, body, chunk_inputs_names) else: emit_node_func(node, body) if node_idx not in chunk_inputs: From 51ef8384c153f46dcbb74c26eec523ad7cd0d51c Mon Sep 17 00:00:00 2001 From: oahzxl Date: Fri, 23 Dec 2022 17:25:36 +0800 Subject: [PATCH 055/209] finish node reorder --- chunk_codegen.py | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 4b3b04d93b91..9623a9d9bbe2 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -1238,7 +1238,7 @@ def _print_compute_op_mem_log(self, log, nodes, title=None): def estimate_chunk_inference_mem( self, - gm: torch.fx.GraphModule, + node_list, chunk_infos=None, ): act_memory = 0.0 @@ -1247,7 +1247,6 @@ def estimate_chunk_inference_mem( active_node_list = [] active_node_list_log = [] not_contiguous_list = [] - node_list = list(gm.graph.nodes) user_to_last_uses = self._get_last_usr(node_list) user_to_last_uses_no_free_var = self._get_last_usr(node_list) 
_delete_free_var_from_last_use(user_to_last_uses_no_free_var) @@ -1281,7 +1280,6 @@ def estimate_chunk_inference_mem( ) / (1024**2) # determine chunk ratio for current node - # TODO: adapt to prepose node memory if chunk_within: chunk_ratio = self._get_chunk_ratio( node, @@ -1371,10 +1369,7 @@ def estimate_chunk_inference_mem( class ChunkRegionSearch(object): def __init__(self, gm) -> None: self.gm = gm - self.node_list = list(gm.graph.nodes) - self.index_tracer = IndexTracer( - self.node_list - ) # node list shared in index tracer + self.index_tracer = IndexTracer(list(gm.graph.nodes)) self.index_tracer.trace_index() self.memory_estimator = MemoryEstimator(self.index_tracer) @@ -1385,7 +1380,7 @@ def _find_peak_node(self, mem_peak): def _get_free_var(self): free_var_idx = [] - for idx, n in enumerate(self.node_list): + for idx, n in enumerate(self.index_tracer.node_list): if n.op == "placeholder": free_var_idx.append(idx) return free_var_idx @@ -1455,13 +1450,13 @@ def _is_not_compute(self, trace, chunk_range, dim_idx): def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx): start_traces = input_trace[start_idx] end_trace = output_trace[end_idx] - end_node = self.node_list[end_idx] + end_node = self.index_tracer.node_list[end_idx] chunk_infos = [] - for end_dim, end_trace_idx in enumerate(end_trace["idx"]): + for end_dim, _ in enumerate(end_trace["idx"]): if len(start_traces) > 1: continue for start_node, start_trace in start_traces.items(): - for start_dim, start_trace_idx in enumerate(start_trace["idx"]): + for start_dim, _ in enumerate(start_trace["idx"]): # dim size cannot be 1 if ( _get_node_shape(end_node)[end_dim] == 1 @@ -1494,7 +1489,7 @@ def _search_possible_chunk_regions(self, max_chunk_region, peak_node): possible_chunk_region = [] output_trace = copy.deepcopy(self.index_tracer.idx_trace_list) input_trace = [] # trace of a node's input nodes - for _, n in enumerate(self.node_list): + for _, n in enumerate(self.index_tracer.node_list): 
cur_trace = {} for arg in n.args: if type(arg) == type(n) and not _is_non_compute_node_except_placeholder( @@ -1507,8 +1502,8 @@ def _search_possible_chunk_regions(self, max_chunk_region, peak_node): for end_idx in range(peak_node, max_chunk_region[1] + 1): # skip non compute nodes if _is_non_compute_node( - self.node_list[start_idx] - ) or _is_non_compute_node(self.node_list[end_idx]): + self.index_tracer.node_list[start_idx] + ) or _is_non_compute_node(self.index_tracer.node_list[end_idx]): continue # select free dim @@ -1577,7 +1572,9 @@ def search_region(self): init_mem_peak, _, active_node, - ) = self.memory_estimator.estimate_chunk_inference_mem(self.gm) + ) = self.memory_estimator.estimate_chunk_inference_mem( + self.index_tracer.node_list + ) mem_peak = init_mem_peak while True: @@ -1590,7 +1587,9 @@ def search_region(self): mem_peak, _, active_node, - ) = self.memory_estimator.estimate_chunk_inference_mem(self.gm, chunk_infos) + ) = self.memory_estimator.estimate_chunk_inference_mem( + self.index_tracer.node_list, chunk_infos + ) if self._stop_search(init_mem_peak, mem_peak): break return chunk_infos From 9b1b890347f345f1c4de2a0991e250dcaf94365a Mon Sep 17 00:00:00 2001 From: oahzxl Date: Fri, 23 Dec 2022 17:32:11 +0800 Subject: [PATCH 056/209] update run --- chunk_codegen_run.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/chunk_codegen_run.py b/chunk_codegen_run.py index ae4653d6545b..3a3b3c599e3e 100644 --- a/chunk_codegen_run.py +++ b/chunk_codegen_run.py @@ -32,15 +32,25 @@ def _is_all_param_close(m: torch.nn.Module, gm: GraphModule) -> bool: def _test_fwd_and_bwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair): + # now_mem = torch.cuda.memory_allocated() / 1024**2 + # with torch.no_grad(): + # node0 = node.clone() + # pair0 = pair.clone() + # model.graph(node0, pair0, now_mem) + # new_now_mem = torch.cuda.memory_allocated() / 1024**2 + # new_max_mem = torch.cuda.max_memory_allocated() / 1024**2 + # 
print("\ncode now:%.2f max:%.2f" %(new_now_mem - now_mem, new_max_mem - now_mem)) + + torch.cuda.reset_peak_memory_stats() now_mem = torch.cuda.memory_allocated() / 1024**2 with torch.no_grad(): - node0 = node.clone() - pair0 = pair.clone() - node1, pair1 = gm(node0, pair0) + node1 = node.clone() + pair1 = pair.clone() + gm(node1, pair1) new_now_mem = torch.cuda.memory_allocated() / 1024**2 new_max_mem = torch.cuda.max_memory_allocated() / 1024**2 - print("now:%.2f max:%.2f" %(new_now_mem - now_mem, new_max_mem - now_mem)) - + print("gm now:%.2f max:%.2f" %(new_now_mem - now_mem, new_max_mem - now_mem)) + # test forward with torch.no_grad(): non_fx_out = model(node, pair) From 786a398a6bdea395e2ca8ddde87c87c8470d971b Mon Sep 17 00:00:00 2001 From: oahzxl Date: Fri, 23 Dec 2022 17:42:51 +0800 Subject: [PATCH 057/209] code style --- chunk_codegen.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 9623a9d9bbe2..f87a3a132e78 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -920,9 +920,13 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim): # loop cur node's all arg until out of chunk while len(tmp_cur_prepose_nodes) > 0: + if prepose_flag == False: + break tmp_next_prepose_nodes = [] tmp_cur_related_prepose_nodes.extend(tmp_cur_prepose_nodes) for cur_prepose_node in tmp_cur_prepose_nodes: + if prepose_flag == False: + break for cur_prepose_node_arg in cur_prepose_node.args: if type(cur_prepose_node_arg) != type(cur_prepose_node): continue @@ -942,8 +946,6 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim): else: prepose_flag = False break - break - break # non compute op else: tmp_next_prepose_nodes.append(cur_prepose_node_arg) From 1b8a066592821870bb8f7a6fce338481efd5140b Mon Sep 17 00:00:00 2001 From: oahzxl Date: Mon, 26 Dec 2022 15:28:01 +0800 Subject: [PATCH 058/209] add chunk select class --- chunk_codegen.py | 80 +++++++++++++++++++++++++++++------------------- 1 
file changed, 49 insertions(+), 31 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index f87a3a132e78..cdd0b1077487 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -1368,12 +1368,60 @@ def estimate_chunk_inference_mem( return act_memory_peak_log, act_memory_after_node_log, active_node_list_log +class ChunkSelector(object): + def __init__(self, index_tracer: IndexTracer, stratge) -> None: + self.index_tracer = index_tracer + assert stratge in ['min_memory', 'fit_memory'] + self.stratge = stratge + self.max_memory = 800 # MB + + def _select_best_chunk_region(self, possible_chunk_regions, chunk_infos): + if self.stratge == 'min_memory': + best_region = self._select_min_memory_chunk_region(possible_chunk_regions, chunk_infos) + elif self.stratge == 'fit_memory': + pass + else: + raise RuntimeError() + return best_region + + def _select_min_memory_chunk_region(self, possible_chunk_regions, chunk_infos): + max_region_range = 0 + best_region = None + while len(possible_chunk_regions) > 0: + for i in possible_chunk_regions: + if i["region"][1] - i["region"][0] > max_region_range: + best_region = i + max_region_range = i["region"][1] - i["region"][0] + if self._is_legal_region(best_region, chunk_infos): + break + possible_chunk_regions.remove(i) + max_region_range = 0 + best_region = None + return best_region + + def _is_legal_region(self, cur_chunk_info, chunk_infos): + (chunk_region_start, chunk_region_end) = cur_chunk_info["region"] + if cur_chunk_info in chunk_infos: + return False + if chunk_region_end < chunk_region_start: + return False + for i in chunk_infos: + region = i["region"] + if not ( + (chunk_region_start > region[1] and chunk_region_end > region[1]) + or (chunk_region_start < region[0] and chunk_region_end < region[0]) + ): + return False + return True + + class ChunkRegionSearch(object): def __init__(self, gm) -> None: self.gm = gm self.index_tracer = IndexTracer(list(gm.graph.nodes)) self.index_tracer.trace_index() 
self.memory_estimator = MemoryEstimator(self.index_tracer) + self.chunk_selector = ChunkSelector(self.index_tracer, stratge="min_memory") def _find_peak_node(self, mem_peak): max_value = max(mem_peak) @@ -1516,36 +1564,6 @@ def _search_possible_chunk_regions(self, max_chunk_region, peak_node): possible_chunk_region.extend(chunk_info) return possible_chunk_region - def _search_best_chunk_region(self, possible_chunk_regions, chunk_infos): - max_region_range = 0 - best_region = None - while len(possible_chunk_regions) > 0: - for i in possible_chunk_regions: - if i["region"][1] - i["region"][0] > max_region_range: - best_region = i - max_region_range = i["region"][1] - i["region"][0] - if self._is_legal_region(best_region, chunk_infos): - break - possible_chunk_regions.remove(i) - max_region_range = 0 - best_region = None - return best_region - - def _is_legal_region(self, cur_chunk_info, chunk_infos): - (chunk_region_start, chunk_region_end) = cur_chunk_info["region"] - if cur_chunk_info in chunk_infos: - return False - if chunk_region_end < chunk_region_start: - return False - for i in chunk_infos: - region = i["region"] - if not ( - (chunk_region_start > region[1] and chunk_region_end > region[1]) - or (chunk_region_start < region[0] and chunk_region_end < region[0]) - ): - return False - return True - def _step_search(self, mem_peak, active_node, chunk_regions): peak_node = self._find_peak_node(mem_peak) max_chunk_region = self._search_max_chunk_region( @@ -1556,7 +1574,7 @@ def _step_search(self, mem_peak, active_node, chunk_regions): possible_chunk_regions = self._search_possible_chunk_regions( max_chunk_region, peak_node ) - best_chunk_region = self._search_best_chunk_region( + best_chunk_region = self.chunk_selector._select_best_chunk_region( possible_chunk_regions, chunk_regions ) best_chunk_region = self.index_tracer.reorder_all(best_chunk_region) From 8f5a0edfab3d9c4636333cba2dcdbb7f2fa74181 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Mon, 26 Dec 2022 
23:08:49 +0800 Subject: [PATCH 059/209] add chunk select --- chunk_codegen.py | 147 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 112 insertions(+), 35 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index cdd0b1077487..330f3dec611c 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -69,7 +69,7 @@ def __init__(self, node_list) -> None: self.node_list = node_list self.idx_trace_list = self._init_idx_trace_list() self.idx_trace_equal = [] - self.idx_view_list = [] + self.idx_view_list = {} self.idx_count = -1 self.all_reorder_map = {i: i for i in range(len(self.idx_trace_list))} @@ -576,7 +576,7 @@ def _assign_view_reshape_index(self, node, node_idx): "idx_to": [self.idx_trace_list[node_idx]["idx"][i] for i in dim_to], "dim_to": dim_to, } - self.idx_view_list.append(view_dict) + self.idx_view_list[node] = view_dict def _merge_equal_idx(self): idx_equal = copy.deepcopy(self.idx_trace_equal) @@ -702,7 +702,7 @@ def _find_inherit_dim(self, input_node, input_dim, node): for node_dim in range(len(_get_node_shape(node))): if ( input_node_idx in node_trace_source[node_dim] - and input_dim in node_trace_source[node_dim][input_node_idx] + and input_dim[0] in node_trace_source[node_dim][input_node_idx] ): return node_dim return None @@ -875,6 +875,7 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim): remove_inputs = [] for input_node in inputs: input_dict = {} + input_node_idx = _find_idx_by_name(input_node.name, self.node_list) for user in input_node.users.keys(): if _is_non_compute_node(user): continue @@ -882,7 +883,11 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim): if start_idx <= user_idx <= end_idx: chunk_dim = all_node_info[user]["chunk_dim"] if chunk_dim is not None: - input_dict[user_idx] = chunk_dim + user_source = self._find_source_trace_from_node(user)[chunk_dim] + if input_node_idx in user_source: + input_dict[user_idx] = user_source[input_node_idx] + else: + return None if len(input_dict) == 0: 
remove_inputs.append(input_node) else: @@ -898,6 +903,7 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim): "inputs_dim": inputs_dim, "outputs": outputs, "outputs_dim": end_dim, + "node_chunk_dim": all_node_info, "args": {}, } @@ -974,6 +980,26 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim): if i not in chunk_info["inputs"]: chunk_info["inputs_non_chunk"].append(i) + # reassgin reshape size, some size may have changed due to chunk + chunk_info = self._reassgin_reshape_size(chunk_info) + + return chunk_info + + def _reassgin_reshape_size(self, chunk_info): + chunk_region = chunk_info['region'] + reshape_size = {} + for node in self.node_list[chunk_region[0]: chunk_region[1] + 1]: + if any(i in node.name for i in ['reshape', 'view']): + reshape_args = node.args[1:] + reshape_log = self.idx_view_list[node] + chunk_dim = chunk_info['node_chunk_dim'][node]['chunk_dim'] + reshape_size[node.name] = {} + for reshape_arg_dim, reshape_arg in enumerate(reshape_args): + if reshape_arg_dim in reshape_log['dim_to']: + continue + if reshape_arg_dim == chunk_dim: + reshape_size[node.name][reshape_arg.name] = "chunk_size" + chunk_info['reshape_size'] = reshape_size return chunk_info def _get_reorder_map(self, chunk_info): @@ -1183,23 +1209,15 @@ def _get_contiguous_memory(self, node, not_contiguous_list, delete=False): not_contiguous_list.append(node) return mem - def _get_chunk_ratio(self, node, chunk_inputs, chunk_inputs_dim, chunk_size): + def _get_chunk_ratio(self, node, chunk_node_dim, chunk_size): + if node not in chunk_node_dim: + return 1.0 node_shape = _get_node_shape(node) - node_source = self.index_tracer._find_source_trace_from_node(node) - for (input_node, input_node_dim) in zip(chunk_inputs, chunk_inputs_dim): - for k, v in input_node_dim.items(): - # TODO: inherit dim should be list too, int now - inherit_dim = self.index_tracer._find_inherit_dim( - input_node, v, self.index_tracer.node_list[k] - ) - if k == 
_find_idx_by_name(node.name, self.index_tracer.node_list): - chunk_ratio = float(chunk_size) / node_shape[inherit_dim] - return chunk_ratio - for dim, source in enumerate(node_source): - if k in source and inherit_dim in source[k]: - chunk_ratio = float(chunk_size) / node_shape[dim] - return chunk_ratio - return 1.0 + chunk_dim = chunk_node_dim[node]['chunk_dim'] + if chunk_dim is None: + return 1.0 + else: + return float(chunk_size) / node_shape[chunk_dim] def _get_chunk_delete_node_size( self, user, user_to_last_uses, chunk_ratio, chunk_inputs_names @@ -1242,6 +1260,7 @@ def estimate_chunk_inference_mem( self, node_list, chunk_infos=None, + print_mem=False, ): act_memory = 0.0 act_memory_peak_log = [] @@ -1271,6 +1290,7 @@ def estimate_chunk_inference_mem( j.name for i in chunk_inputs_non_chunk for j in i ] chunk_outputs = [i["outputs"][0] for i in chunk_infos] + chunk_node_dim = [i["node_chunk_dim"] for i in chunk_infos] for idx, node in enumerate(node_list): # if node in chunk start nodes, change chunk ratio and add chunk_tensor @@ -1285,8 +1305,7 @@ def estimate_chunk_inference_mem( if chunk_within: chunk_ratio = self._get_chunk_ratio( node, - chunk_inputs[chunk_region_idx], - chunk_inputs_dim[chunk_region_idx], + chunk_node_dim[chunk_region_idx], chunk_size, ) @@ -1357,11 +1376,12 @@ def estimate_chunk_inference_mem( act_memory_after_node_log.append(act_memory) active_node_list_log.append(copy.deepcopy(active_node_list)) - print("with chunk" if use_chunk else "without chunk") - # self._print_mem_log(act_memory_peak_log, node_list, "peak") - # self._print_mem_log(act_memory_after_node_log, node_list, "after") - self._print_compute_op_mem_log(act_memory_peak_log, node_list, "peak") - self._print_compute_op_mem_log(act_memory_after_node_log, node_list, "after") + if print_mem: + print("with chunk" if use_chunk else "without chunk") + # self._print_mem_log(act_memory_peak_log, node_list, "peak") + # self._print_mem_log(act_memory_after_node_log, node_list, 
"after") + self._print_compute_op_mem_log(act_memory_peak_log, node_list, "peak") + self._print_compute_op_mem_log(act_memory_after_node_log, node_list, "after") # param_memory = parameter_size(gm) # all_memory = act_memory + param_memory @@ -1369,21 +1389,70 @@ def estimate_chunk_inference_mem( class ChunkSelector(object): - def __init__(self, index_tracer: IndexTracer, stratge) -> None: + def __init__(self, index_tracer: IndexTracer, memory_estimator: MemoryEstimator, stratge): self.index_tracer = index_tracer + self.memory_estimator = memory_estimator assert stratge in ['min_memory', 'fit_memory'] self.stratge = stratge - self.max_memory = 800 # MB + self.max_memory = 600 # MB - def _select_best_chunk_region(self, possible_chunk_regions, chunk_infos): + def _select_best_chunk_region(self, possible_chunk_regions, + chunk_infos, peak_node, max_chunk_region, mem_peak): if self.stratge == 'min_memory': best_region = self._select_min_memory_chunk_region(possible_chunk_regions, chunk_infos) elif self.stratge == 'fit_memory': - pass + best_region = self._select_fit_memory_chunk_region( + possible_chunk_regions, chunk_infos, peak_node, max_chunk_region, mem_peak) else: raise RuntimeError() return best_region + def _select_fit_memory_chunk_region(self, possible_chunk_regions, + chunk_infos, peak_node, max_chunk_region, mem_peak): + # stop chunk if max memory satisfy memory limit + if max(mem_peak) < self.max_memory: + return None + + # remove illegal regions + illegal_regions = [] + for i in possible_chunk_regions: + if not self._is_legal_region(i, chunk_infos): + illegal_regions.append(i) + for i in illegal_regions: + if i in possible_chunk_regions: + possible_chunk_regions.remove(i) + + # get mem for chunk region + regions_dict = [] + for region in possible_chunk_regions: + cur_chunk_infos = chunk_infos + [region] + cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem( + self.index_tracer.node_list, cur_chunk_infos)[0] + cur_chunk_region_peak = 
cur_mem_peak[max_chunk_region[0]: max_chunk_region[1] + 1] + cur_chunk_region_max_peak = max(cur_chunk_region_peak) + if cur_chunk_region_max_peak < self.max_memory: + regions_dict.append({ + "chunk_info": region, + "chunk_max_mem": cur_chunk_region_max_peak, + "chunk_len": self._get_compute_node_num(region['region'][0], region['region'][1]), + }) + # no region found + if len(regions_dict) == 0: + return None + + # select the min chunk len + chunk_len = [i["chunk_len"] for i in regions_dict] + best_region_idx = chunk_len.index(min(chunk_len)) + best_region = regions_dict[best_region_idx]["chunk_info"] + return best_region + + def _get_compute_node_num(self, start, end): + count = 0 + for i in self.index_tracer.node_list[start: end+1]: + if _is_non_compute_node(i): + count += 1 + return count + def _select_min_memory_chunk_region(self, possible_chunk_regions, chunk_infos): max_region_range = 0 best_region = None @@ -1421,7 +1490,7 @@ def __init__(self, gm) -> None: self.index_tracer = IndexTracer(list(gm.graph.nodes)) self.index_tracer.trace_index() self.memory_estimator = MemoryEstimator(self.index_tracer) - self.chunk_selector = ChunkSelector(self.index_tracer, stratge="min_memory") + self.chunk_selector = ChunkSelector(self.index_tracer, self.memory_estimator, stratge="fit_memory") def _find_peak_node(self, mem_peak): max_value = max(mem_peak) @@ -1575,7 +1644,7 @@ def _step_search(self, mem_peak, active_node, chunk_regions): max_chunk_region, peak_node ) best_chunk_region = self.chunk_selector._select_best_chunk_region( - possible_chunk_regions, chunk_regions + possible_chunk_regions, chunk_regions, peak_node, max_chunk_region, mem_peak ) best_chunk_region = self.index_tracer.reorder_all(best_chunk_region) return best_chunk_region @@ -1608,7 +1677,7 @@ def search_region(self): _, active_node, ) = self.memory_estimator.estimate_chunk_inference_mem( - self.index_tracer.node_list, chunk_infos + self.index_tracer.node_list, chunk_infos, print_mem=True ) if 
self._stop_search(init_mem_peak, mem_peak): break @@ -1736,6 +1805,13 @@ def _replace_name(context, name_from, name_to): return context +def _replace_reshape_size(context, node_name, reshape_size_dict): + if node_name not in reshape_size_dict: + return context + for size_name, size_value in reshape_size_dict[node_name].items(): + context = context.replace(size_name, size_value) + return context + def emit_code_with_chunk( body, ckpt_func, @@ -1802,11 +1878,12 @@ def emit_code_with_chunk( for idx, dim in chunk_inputs_dim[region_idx][input_node_idx].items(): if idx == node_idx: chunk_slice = _gen_chunk_slice_dim( - dim, "chunk_idx", _get_node_shape(input_node) + dim[0], "chunk_idx", _get_node_shape(input_node) ) body[-1] = _replace_name( body[-1], input_node.name, input_node.name + chunk_slice ) + body[-1] = _replace_reshape_size(body[-1], node.name, chunk_search[region_idx]['reshape_size']) body[-1] = " " + body[-1] delete_unused_value_func(node, body, chunk_inputs_names) else: From 378a49dc6c259773cdc198841a75137f7c6edc7f Mon Sep 17 00:00:00 2001 From: oahzxl Date: Tue, 27 Dec 2022 09:48:59 +0800 Subject: [PATCH 060/209] code style --- chunk_codegen.py | 101 +++++++++++++++++++++++++++++------------------ 1 file changed, 63 insertions(+), 38 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 330f3dec611c..1255852d777d 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -982,24 +982,24 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim): # reassgin reshape size, some size may have changed due to chunk chunk_info = self._reassgin_reshape_size(chunk_info) - + return chunk_info - + def _reassgin_reshape_size(self, chunk_info): - chunk_region = chunk_info['region'] + chunk_region = chunk_info["region"] reshape_size = {} - for node in self.node_list[chunk_region[0]: chunk_region[1] + 1]: - if any(i in node.name for i in ['reshape', 'view']): + for node in self.node_list[chunk_region[0] : chunk_region[1] + 1]: + if any(i in node.name for 
i in ["reshape", "view"]): reshape_args = node.args[1:] reshape_log = self.idx_view_list[node] - chunk_dim = chunk_info['node_chunk_dim'][node]['chunk_dim'] + chunk_dim = chunk_info["node_chunk_dim"][node]["chunk_dim"] reshape_size[node.name] = {} for reshape_arg_dim, reshape_arg in enumerate(reshape_args): - if reshape_arg_dim in reshape_log['dim_to']: + if reshape_arg_dim in reshape_log["dim_to"]: continue if reshape_arg_dim == chunk_dim: reshape_size[node.name][reshape_arg.name] = "chunk_size" - chunk_info['reshape_size'] = reshape_size + chunk_info["reshape_size"] = reshape_size return chunk_info def _get_reorder_map(self, chunk_info): @@ -1213,7 +1213,7 @@ def _get_chunk_ratio(self, node, chunk_node_dim, chunk_size): if node not in chunk_node_dim: return 1.0 node_shape = _get_node_shape(node) - chunk_dim = chunk_node_dim[node]['chunk_dim'] + chunk_dim = chunk_node_dim[node]["chunk_dim"] if chunk_dim is None: return 1.0 else: @@ -1381,7 +1381,9 @@ def estimate_chunk_inference_mem( # self._print_mem_log(act_memory_peak_log, node_list, "peak") # self._print_mem_log(act_memory_after_node_log, node_list, "after") self._print_compute_op_mem_log(act_memory_peak_log, node_list, "peak") - self._print_compute_op_mem_log(act_memory_after_node_log, node_list, "after") + self._print_compute_op_mem_log( + act_memory_after_node_log, node_list, "after" + ) # param_memory = parameter_size(gm) # all_memory = act_memory + param_memory @@ -1389,30 +1391,41 @@ def estimate_chunk_inference_mem( class ChunkSelector(object): - def __init__(self, index_tracer: IndexTracer, memory_estimator: MemoryEstimator, stratge): + def __init__( + self, index_tracer: IndexTracer, memory_estimator: MemoryEstimator, stratge + ): self.index_tracer = index_tracer self.memory_estimator = memory_estimator - assert stratge in ['min_memory', 'fit_memory'] + assert stratge in ["min_memory", "fit_memory"] self.stratge = stratge self.max_memory = 600 # MB - - def _select_best_chunk_region(self, 
possible_chunk_regions, - chunk_infos, peak_node, max_chunk_region, mem_peak): - if self.stratge == 'min_memory': - best_region = self._select_min_memory_chunk_region(possible_chunk_regions, chunk_infos) - elif self.stratge == 'fit_memory': + + def _select_best_chunk_region( + self, possible_chunk_regions, chunk_infos, peak_node, max_chunk_region, mem_peak + ): + if self.stratge == "min_memory": + best_region = self._select_min_memory_chunk_region( + possible_chunk_regions, chunk_infos + ) + elif self.stratge == "fit_memory": best_region = self._select_fit_memory_chunk_region( - possible_chunk_regions, chunk_infos, peak_node, max_chunk_region, mem_peak) + possible_chunk_regions, + chunk_infos, + peak_node, + max_chunk_region, + mem_peak, + ) else: raise RuntimeError() return best_region - - def _select_fit_memory_chunk_region(self, possible_chunk_regions, - chunk_infos, peak_node, max_chunk_region, mem_peak): + + def _select_fit_memory_chunk_region( + self, possible_chunk_regions, chunk_infos, peak_node, max_chunk_region, mem_peak + ): # stop chunk if max memory satisfy memory limit if max(mem_peak) < self.max_memory: return None - + # remove illegal regions illegal_regions = [] for i in possible_chunk_regions: @@ -1421,38 +1434,45 @@ def _select_fit_memory_chunk_region(self, possible_chunk_regions, for i in illegal_regions: if i in possible_chunk_regions: possible_chunk_regions.remove(i) - + # get mem for chunk region regions_dict = [] for region in possible_chunk_regions: cur_chunk_infos = chunk_infos + [region] cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem( - self.index_tracer.node_list, cur_chunk_infos)[0] - cur_chunk_region_peak = cur_mem_peak[max_chunk_region[0]: max_chunk_region[1] + 1] + self.index_tracer.node_list, cur_chunk_infos + )[0] + cur_chunk_region_peak = cur_mem_peak[ + max_chunk_region[0] : max_chunk_region[1] + 1 + ] cur_chunk_region_max_peak = max(cur_chunk_region_peak) if cur_chunk_region_max_peak < self.max_memory: - 
regions_dict.append({ - "chunk_info": region, - "chunk_max_mem": cur_chunk_region_max_peak, - "chunk_len": self._get_compute_node_num(region['region'][0], region['region'][1]), - }) + regions_dict.append( + { + "chunk_info": region, + "chunk_max_mem": cur_chunk_region_max_peak, + "chunk_len": self._get_compute_node_num( + region["region"][0], region["region"][1] + ), + } + ) # no region found if len(regions_dict) == 0: return None - + # select the min chunk len chunk_len = [i["chunk_len"] for i in regions_dict] best_region_idx = chunk_len.index(min(chunk_len)) best_region = regions_dict[best_region_idx]["chunk_info"] return best_region - + def _get_compute_node_num(self, start, end): count = 0 - for i in self.index_tracer.node_list[start: end+1]: + for i in self.index_tracer.node_list[start : end + 1]: if _is_non_compute_node(i): count += 1 return count - + def _select_min_memory_chunk_region(self, possible_chunk_regions, chunk_infos): max_region_range = 0 best_region = None @@ -1490,7 +1510,9 @@ def __init__(self, gm) -> None: self.index_tracer = IndexTracer(list(gm.graph.nodes)) self.index_tracer.trace_index() self.memory_estimator = MemoryEstimator(self.index_tracer) - self.chunk_selector = ChunkSelector(self.index_tracer, self.memory_estimator, stratge="fit_memory") + self.chunk_selector = ChunkSelector( + self.index_tracer, self.memory_estimator, stratge="fit_memory" + ) def _find_peak_node(self, mem_peak): max_value = max(mem_peak) @@ -1808,10 +1830,11 @@ def _replace_name(context, name_from, name_to): def _replace_reshape_size(context, node_name, reshape_size_dict): if node_name not in reshape_size_dict: return context - for size_name, size_value in reshape_size_dict[node_name].items(): + for size_name, size_value in reshape_size_dict[node_name].items(): context = context.replace(size_name, size_value) return context + def emit_code_with_chunk( body, ckpt_func, @@ -1883,7 +1906,9 @@ def emit_code_with_chunk( body[-1] = _replace_name( body[-1], 
input_node.name, input_node.name + chunk_slice ) - body[-1] = _replace_reshape_size(body[-1], node.name, chunk_search[region_idx]['reshape_size']) + body[-1] = _replace_reshape_size( + body[-1], node.name, chunk_search[region_idx]["reshape_size"] + ) body[-1] = " " + body[-1] delete_unused_value_func(node, body, chunk_inputs_names) else: From 6be89a3b82d370be152c93dd7277e234e68eaea6 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Tue, 27 Dec 2022 14:48:25 +0800 Subject: [PATCH 061/209] add chunksize in emit, fix bug in reassgin shape --- chunk_codegen.py | 56 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 4 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 1255852d777d..470768855779 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -988,6 +988,7 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim): def _reassgin_reshape_size(self, chunk_info): chunk_region = chunk_info["region"] reshape_size = {} + chunk_shape = _get_node_shape(chunk_info["outputs"][0])[chunk_info["outputs_dim"]] for node in self.node_list[chunk_region[0] : chunk_region[1] + 1]: if any(i in node.name for i in ["reshape", "view"]): reshape_args = node.args[1:] @@ -998,7 +999,7 @@ def _reassgin_reshape_size(self, chunk_info): if reshape_arg_dim in reshape_log["dim_to"]: continue if reshape_arg_dim == chunk_dim: - reshape_size[node.name][reshape_arg.name] = "chunk_size" + reshape_size[node.name][reshape_arg.name] = "min(chunk_size, %d - chunk_idx)" % chunk_shape chunk_info["reshape_size"] = reshape_size return chunk_info @@ -1276,7 +1277,6 @@ def estimate_chunk_inference_mem( chunk_within = False chunk_region_idx = None chunk_ratio = 1 # use it to estimate chunk mem - chunk_size = 1 chunk_inputs_names = [] if use_chunk: @@ -1285,12 +1285,14 @@ def estimate_chunk_inference_mem( chunk_ends = [i[1] for i in chunk_regions] chunk_inputs = [i["inputs"] for i in chunk_infos] chunk_inputs_non_chunk = [i["inputs_non_chunk"] for i in 
chunk_infos] - chunk_inputs_dim = [i["inputs_dim"] for i in chunk_infos] chunk_inputs_names = [j.name for i in chunk_inputs for j in i] + [ j.name for i in chunk_inputs_non_chunk for j in i ] chunk_outputs = [i["outputs"][0] for i in chunk_infos] chunk_node_dim = [i["node_chunk_dim"] for i in chunk_infos] + chunk_sizes = [ + i["chunk_size"] if "chunk_size" in i else 1 for i in chunk_infos + ] for idx, node in enumerate(node_list): # if node in chunk start nodes, change chunk ratio and add chunk_tensor @@ -1306,7 +1308,7 @@ def estimate_chunk_inference_mem( chunk_ratio = self._get_chunk_ratio( node, chunk_node_dim[chunk_region_idx], - chunk_size, + chunk_sizes[chunk_region_idx], ) # if node is placeholder, just add the size of the node @@ -1464,8 +1466,53 @@ def _select_fit_memory_chunk_region( chunk_len = [i["chunk_len"] for i in regions_dict] best_region_idx = chunk_len.index(min(chunk_len)) best_region = regions_dict[best_region_idx]["chunk_info"] + + # get max chunk size + best_region = self._get_fit_chunk_size(best_region, chunk_infos) return best_region + def _get_fit_chunk_size(self, chunk_info, chunk_infos): + chunk_size = 1 + chunk_info["chunk_size"] = chunk_size + cur_chunk_max_mem = 0 + # search a region + while cur_chunk_max_mem < self.max_memory: + chunk_size *= 2 + chunk_info["chunk_size"] = chunk_size + cur_chunk_infos = chunk_infos + [chunk_info] + cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem( + self.index_tracer.node_list, cur_chunk_infos + )[0] + cur_chunk_max_mem = max( + cur_mem_peak[chunk_info["region"][0] : chunk_info["region"][1] + 1] + ) + # search exact size + chunk_info["chunk_size"] = self._chunk_size_binary_search( + chunk_size // 2, chunk_size, chunk_info, chunk_infos + ) + return chunk_info + + def _chunk_size_binary_search(self, l, r, chunk_info, chunk_infos): + if l >= 16: + gap = 4 + else: + gap = 1 + while r >= l + gap: + mid = int(l + (r - l)/2) + chunk_info["chunk_size"] = mid + cur_chunk_infos = chunk_infos + 
[chunk_info] + cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem( + self.index_tracer.node_list, cur_chunk_infos + )[0] + cur_chunk_max_mem = max( + cur_mem_peak[chunk_info["region"][0] : chunk_info["region"][1] + 1] + ) + if cur_chunk_max_mem >= self.max_memory: + r = mid - gap + else: + l = mid + gap + return l + def _get_compute_node_num(self, start, end): count = 0 for i in self.index_tracer.node_list[start : end + 1]: @@ -1891,6 +1938,7 @@ def emit_code_with_chunk( chunk_inputs[region_idx], chunk_outputs[region_idx], chunk_outputs_dim[region_idx], + chunk_size=chunk_search[region_idx]["chunk_size"] ) ) From a2b4755ce96e2e8dea100bafd7790e22426aa548 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Tue, 27 Dec 2022 14:49:52 +0800 Subject: [PATCH 062/209] code style --- chunk_codegen.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 470768855779..3cd10350eaba 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -988,7 +988,9 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim): def _reassgin_reshape_size(self, chunk_info): chunk_region = chunk_info["region"] reshape_size = {} - chunk_shape = _get_node_shape(chunk_info["outputs"][0])[chunk_info["outputs_dim"]] + chunk_shape = _get_node_shape(chunk_info["outputs"][0])[ + chunk_info["outputs_dim"] + ] for node in self.node_list[chunk_region[0] : chunk_region[1] + 1]: if any(i in node.name for i in ["reshape", "view"]): reshape_args = node.args[1:] @@ -999,7 +1001,9 @@ def _reassgin_reshape_size(self, chunk_info): if reshape_arg_dim in reshape_log["dim_to"]: continue if reshape_arg_dim == chunk_dim: - reshape_size[node.name][reshape_arg.name] = "min(chunk_size, %d - chunk_idx)" % chunk_shape + reshape_size[node.name][reshape_arg.name] = ( + "min(chunk_size, %d - chunk_idx)" % chunk_shape + ) chunk_info["reshape_size"] = reshape_size return chunk_info @@ -1498,7 +1502,7 @@ def _chunk_size_binary_search(self, l, r, 
chunk_info, chunk_infos): else: gap = 1 while r >= l + gap: - mid = int(l + (r - l)/2) + mid = int(l + (r - l) / 2) chunk_info["chunk_size"] = mid cur_chunk_infos = chunk_infos + [chunk_info] cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem( @@ -1938,7 +1942,7 @@ def emit_code_with_chunk( chunk_inputs[region_idx], chunk_outputs[region_idx], chunk_outputs_dim[region_idx], - chunk_size=chunk_search[region_idx]["chunk_size"] + chunk_search[region_idx]["chunk_size"], ) ) From cb2dd1a10614c21ca78e1c0cea2f6f7aa882e712 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Tue, 27 Dec 2022 15:01:58 +0800 Subject: [PATCH 063/209] turn off print mem --- chunk_codegen.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 3cd10350eaba..6caed88d84d2 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -1750,10 +1750,13 @@ def search_region(self): _, active_node, ) = self.memory_estimator.estimate_chunk_inference_mem( - self.index_tracer.node_list, chunk_infos, print_mem=True + self.index_tracer.node_list, chunk_infos ) if self._stop_search(init_mem_peak, mem_peak): break + # self.memory_estimator.estimate_chunk_inference_mem( + # self.index_tracer.node_list, chunk_infos, print_mem=True + # ) return chunk_infos From 69af93107f09db3fb90116144296ebc20adc7b52 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Thu, 29 Dec 2022 11:28:25 +0800 Subject: [PATCH 064/209] add evoformer openfold init --- evoformer_openfold/evoformer.py | 59 +++++++++ evoformer_openfold/initializer.py | 29 +++++ evoformer_openfold/kernel.py | 19 +++ evoformer_openfold/msa.py | 95 +++++++++++++++ evoformer_openfold/ops.py | 176 +++++++++++++++++++++++++++ evoformer_openfold/triangle.py | 192 ++++++++++++++++++++++++++++++ 6 files changed, 570 insertions(+) create mode 100644 evoformer_openfold/evoformer.py create mode 100755 evoformer_openfold/initializer.py create mode 100644 evoformer_openfold/kernel.py create mode 100644 
evoformer_openfold/msa.py create mode 100755 evoformer_openfold/ops.py create mode 100644 evoformer_openfold/triangle.py diff --git a/evoformer_openfold/evoformer.py b/evoformer_openfold/evoformer.py new file mode 100644 index 000000000000..cfd2bb2a2529 --- /dev/null +++ b/evoformer_openfold/evoformer.py @@ -0,0 +1,59 @@ +import torch +import torch.nn as nn + +from .msa import MSAStack +from .ops import OutProductMean +from .triangle import PairStack + + +def print_memory(init_mem, text=None): + now_mem = torch.cuda.memory_allocated() / 1024 ** 2 - init_mem + max_mem = torch.cuda.max_memory_allocated() / 1024 ** 2 - init_mem + print("%s now:%.2f max:%.2f" % ("" if text is None else text, now_mem, max_mem)) + torch.cuda.reset_peak_memory_stats() + + +class EvoformerBlock(nn.Module): + + def __init__(self, d_node, d_pair): + super(EvoformerBlock, self).__init__() + + self.msa_stack = MSAStack(d_node, d_pair, p_drop=0.15) + self.communication = OutProductMean(n_feat=d_node, n_feat_out=d_pair, n_feat_proj=32) + self.pair_stack = PairStack(d_pair=d_pair) + + def forward(self, node, pair): + node = self.msa_stack(node, pair) + pair = pair + self.communication(node) + pair = self.pair_stack(pair) + return node, pair + + +class Evoformer(nn.Module): + + def __init__(self, d_node, d_pair): + super(Evoformer, self).__init__() + + self.blocks = nn.ModuleList() + for _ in range(1): + self.blocks.append(EvoformerBlock(d_node, d_pair)) + + def forward(self, node, pair): + for b in self.blocks: + node, pair = b(node, pair) + return node, pair + + +def evoformer_tiny(): + return Evoformer(d_node=64, d_pair=32) + + +def evoformer_base(): + return Evoformer(d_node=256, d_pair=128) + + +def evoformer_large(): + return Evoformer(d_node=512, d_pair=256) + + +__all__ = ['Evoformer', 'evoformer_base', 'evoformer_large'] diff --git a/evoformer_openfold/initializer.py b/evoformer_openfold/initializer.py new file mode 100755 index 000000000000..c6ce0659e597 --- /dev/null +++ 
b/evoformer_openfold/initializer.py @@ -0,0 +1,29 @@ +import math + +import numpy as np +import torch.nn as nn + + +def glorot_uniform_af(x, gain=1.0): + """ + initialize tensors the same as xavier_initializer in PyTorch, but the dimensions are different: + In PyTorch: + [feature_out, feature_in, n_head ...] + In Jax: + [... n_head, feature_in, feature_out] + However, there is a feature in original Alphafold2 code that they use the Jax version initializer to initialize tensors like: + [feature_in, n_head, feature_out] + + In this function, we keep this feature to initialize [feature_in, n_head, ..., feature_out] tensors + """ + fan_in, fan_out = x.shape[-2:] + if len(x.shape) > 2: + receptive_field_size = np.prod(x.shape[:-2]) + fan_in *= receptive_field_size + fan_out *= receptive_field_size + std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) + dev = math.sqrt(3.0) * std # Calculate uniform bounds from standard deviation + + nn.init.uniform_(x, -dev, dev) + + return x diff --git a/evoformer_openfold/kernel.py b/evoformer_openfold/kernel.py new file mode 100644 index 000000000000..26ab5dc53261 --- /dev/null +++ b/evoformer_openfold/kernel.py @@ -0,0 +1,19 @@ +import torch +import torch.nn.functional as F + + +def bias_sigmod_ele(y, bias, z): + return torch.sigmoid(y + bias) * z + + +def bias_dropout_add(x: torch.Tensor, bias: torch.Tensor, dropmask: torch.Tensor, + residual: torch.Tensor, prob: float) -> torch.Tensor: + out = (x + bias) * F.dropout(dropmask, p=prob, training=False) + out = residual + out + return out + + +def bias_ele_dropout_residual(ab: torch.Tensor, b: torch.Tensor, g: torch.Tensor, + dropout_mask: torch.Tensor, Z_raw: torch.Tensor, + prob: float) -> torch.Tensor: + return Z_raw + F.dropout(dropout_mask, p=prob, training=True) * (g * (ab + b)) \ No newline at end of file diff --git a/evoformer_openfold/msa.py b/evoformer_openfold/msa.py new file mode 100644 index 000000000000..cac456638a55 --- /dev/null +++ b/evoformer_openfold/msa.py @@ 
-0,0 +1,95 @@ +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange +from torch.nn import LayerNorm + +from .kernel import bias_dropout_add +from .ops import SelfAttention, Transition + + +class MSARowAttentionWithPairBias(nn.Module): + + def __init__(self, d_node, d_pair, c=32, n_head=8, p_drop=0.15): + super(MSARowAttentionWithPairBias, self).__init__() + self.d_node = d_node + self.d_pair = d_pair + self.c = c + self.n_head = n_head + self.p_drop = p_drop + + self.layernormM = LayerNorm(d_node) + self.layernormZ = LayerNorm(d_pair) + + _init_weights = torch.nn.init.normal_(torch.zeros([n_head, d_pair]), + std=1.0 / math.sqrt(d_pair)) + self.linear_b_weights = nn.parameter.Parameter(data=_init_weights, requires_grad=True) + + self.attention = SelfAttention(qkv_dim=d_node, + c=c, + n_head=n_head, + out_dim=d_node, + gating=True, + last_bias_fuse=True) + + self.out_bias = nn.parameter.Parameter(data=torch.zeros((d_node,)), requires_grad=True) + + def forward(self, M_raw, Z): + ## Input projections + M = self.layernormM(M_raw) + Z = self.layernormZ(Z) + b = F.linear(Z, self.linear_b_weights) + b = b.permute(0, 3, 1, 2) + # b = rearrange(b, 'b q k h -> b h q k') + + M = self.attention(M, b) + dropout_mask = torch.ones_like(M[:, 0:1, :, :]).to(M.device).to(M.dtype) + + return bias_dropout_add(M, self.out_bias, dropout_mask, M_raw, prob=self.p_drop) + + +class MSAColumnAttention(nn.Module): + + def __init__(self, d_node, c=32, n_head=8): + super(MSAColumnAttention, self).__init__() + self.d_node = d_node + self.c = c + self.n_head = n_head + + self.layernormM = LayerNorm(d_node) + self.attention = SelfAttention(qkv_dim=d_node, + c=c, + n_head=n_head, + out_dim=d_node, + gating=True) + + def forward(self, M_raw): + M = M_raw.transpose(-2, -3) + M = self.layernormM(M) + + M = self.attention(M) + + M = M.transpose(-2, -3) + return M_raw + M + + +class MSAStack(nn.Module): + + def __init__(self, d_node, d_pair, 
p_drop=0.15): + super(MSAStack, self).__init__() + + self.MSARowAttentionWithPairBias = MSARowAttentionWithPairBias(d_node=d_node, + d_pair=d_pair, + p_drop=p_drop) + + self.MSAColumnAttention = MSAColumnAttention(d_node=d_node) + self.MSATransition = Transition(d=d_node) + + def forward(self, node, pair): + node = self.MSARowAttentionWithPairBias(node, pair) + node = self.MSAColumnAttention(node) + node = self.MSATransition(node) + + return node diff --git a/evoformer_openfold/ops.py b/evoformer_openfold/ops.py new file mode 100755 index 000000000000..611b7b0fe777 --- /dev/null +++ b/evoformer_openfold/ops.py @@ -0,0 +1,176 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange +from torch.nn import LayerNorm + +from .initializer import glorot_uniform_af +from .kernel import bias_sigmod_ele + + +class DropoutRowwise(nn.Module): + + def __init__(self, p): + super(DropoutRowwise, self).__init__() + self.p = p + self.dropout = nn.Dropout(p=p) + + def forward(self, x): + dropout_mask = torch.ones_like(x[:, 0:1, :, :]) + dropout_mask = self.dropout(dropout_mask) + return dropout_mask * x + + +class DropoutColumnwise(nn.Module): + + def __init__(self, p): + super(DropoutColumnwise, self).__init__() + self.p = p + self.dropout = nn.Dropout(p=p) + + def forward(self, x): + dropout_mask = torch.ones_like(x[:, :, 0:1, :]) + dropout_mask = self.dropout(dropout_mask) + return dropout_mask * x + + +class Transition(nn.Module): + + def __init__(self, d, n=4): + super(Transition, self).__init__() + self.norm = LayerNorm(d) + self.linear1 = Linear(d, n * d, initializer='relu') + self.linear2 = Linear(n * d, d, initializer='zeros') + + def forward(self, src): + x = self.norm(src) + x = self.linear2(F.relu(self.linear1(x))) + return src + x + + +class OutProductMean(nn.Module): + + def __init__(self, n_feat=64, n_feat_out=128, n_feat_proj=32): + super(OutProductMean, self).__init__() + + self.layernormM = LayerNorm(n_feat) + 
self.linear_a = Linear(n_feat, n_feat_proj) + self.linear_b = Linear(n_feat, n_feat_proj) + + self.o_linear = Linear(n_feat_proj * n_feat_proj, + n_feat_out, + initializer='zero', + use_bias=True) + + def forward(self, M): + M = self.layernormM(M) + left_act = self.linear_a(M) + right_act = self.linear_b(M) + + O = torch.einsum('bsid,bsje->bijde', left_act, right_act).contiguous() + # O = rearrange(O, 'b i j d e -> b i j (d e)') + O = O.reshape(O.shape[0], O.shape[1], O.shape[2], -1) + Z = self.o_linear(O) + + return Z + + +class Linear(nn.Linear): + """ + A Linear layer with built-in nonstandard initializations. Called just + like torch.nn.Linear. + Implements the initializers in 1.11.4, plus some additional ones found + in the code. + """ + + def __init__( + self, + feature_in: int, + feature_out: int, + initializer: str = 'linear', + use_bias: bool = True, + bias_init: float = 0., + ): + super(Linear, self).__init__(feature_in, feature_out, bias=use_bias) + + self.use_bias = use_bias + if initializer == 'linear': + glorot_uniform_af(self.weight, gain=1.0) + elif initializer == 'relu': + glorot_uniform_af(self.weight, gain=2.0) + elif initializer == 'zeros': + nn.init.zeros_(self.weight) + if self.use_bias: + with torch.no_grad(): + self.bias.fill_(bias_init) + + +class SelfAttention(nn.Module): + """ + Multi-Head SelfAttention dealing with [batch_size1, batch_size2, len, dim] tensors + """ + + def __init__(self, qkv_dim, c, n_head, out_dim, gating=True, last_bias_fuse=False): + super(SelfAttention, self).__init__() + self.qkv_dim = qkv_dim + self.c = c + self.n_head = n_head + self.out_dim = out_dim + self.gating = gating + self.last_bias_fuse = last_bias_fuse + + self.scaling = self.c**(-0.5) + + # self.to_qkv = Linear(qkv_dim, 3 * n_head * c, initializer='linear') + self.to_q = Linear(qkv_dim, n_head * c, initializer='linear', use_bias=False) + self.to_k = Linear(qkv_dim, n_head * c, initializer='linear', use_bias=False) + self.to_v = Linear(qkv_dim, n_head * 
c, initializer='linear', use_bias=False) + + if gating: + self.gating_bias = nn.parameter.Parameter(data=torch.ones((n_head * c,))) + self.gating_linear = Linear(qkv_dim, n_head * c, initializer='zero', use_bias=False) + + self.o_linear = Linear(n_head * c, + out_dim, + initializer='zero', + use_bias=(not last_bias_fuse)) + + def forward(self, in_data, nonbatched_bias=None): + """ + :param in_data: [batch_size1, batch_size2, len_qkv, qkv_dim] + :param bias: None or [batch_size1, batch_size2, n_head, len_q, len_kv] + :param nonbatched_bias: None or [batch_size1, n_head, len_q, len_kv] + """ + + # qkv = self.to_qkv(in_data).chunk(3, dim=-1) + # q, k, v = map(lambda t: rearrange(t, 'b1 b2 n (h d) -> b1 b2 h n d', h=self.n_head), qkv) + + q = self.to_q(in_data) + k = self.to_k(in_data) + v = self.to_v(in_data) + + # q, k, v = map(lambda t: rearrange(t, 'b1 b2 n (h d) -> b1 b2 h n d', h=self.n_head), + # [q, k, v]) + q, k, v = map(lambda t: t.view(t.shape[0], t.shape[1], t.shape[2], self.n_head, -1).permute(0, 1, 3, 2, 4), + [q, k, v]) + + q = q * self.scaling + + logits = torch.matmul(q, k.transpose(-1, -2)) + + if nonbatched_bias is not None: + logits += nonbatched_bias.unsqueeze(1) + weights = torch.softmax(logits, dim=-1) + # weights = softmax(logits) + + weighted_avg = torch.matmul(weights, v) + # weighted_avg = rearrange(weighted_avg, 'b1 b2 h n d -> b1 b2 n (h d)') + weighted_avg = weighted_avg.permute(0, 1, 3, 2, 4) + weighted_avg = weighted_avg.reshape(weighted_avg.shape[0], weighted_avg.shape[1], weighted_avg.shape[2], -1) + + if self.gating: + gate_values = self.gating_linear(in_data) + weighted_avg = bias_sigmod_ele(gate_values, self.gating_bias, weighted_avg) + + output = self.o_linear(weighted_avg) + return output diff --git a/evoformer_openfold/triangle.py b/evoformer_openfold/triangle.py new file mode 100644 index 000000000000..f479469c3836 --- /dev/null +++ b/evoformer_openfold/triangle.py @@ -0,0 +1,192 @@ +import math + +import torch +import torch.nn 
as nn +from torch.nn import LayerNorm + +from .kernel import bias_dropout_add, bias_ele_dropout_residual +from .ops import Linear, SelfAttention, Transition + + +def permute_final_dims(tensor, inds): + zero_index = -1 * len(inds) + first_inds = list(range(len(tensor.shape[:zero_index]))) + return tensor.permute(first_inds + [zero_index + i for i in inds]) + + +class TriangleMultiplicationOutgoing(nn.Module): + + def __init__(self, d_pair, p_drop, c=128): + super(TriangleMultiplicationOutgoing, self).__init__() + self.d_pair = d_pair + self.c = c + + self.layernorm1 = LayerNorm(d_pair) + self.left_projection = Linear(d_pair, c) + self.right_projection = Linear(d_pair, c) + self.left_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.) + self.right_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.) + + self.output_gate = Linear(d_pair, d_pair, initializer='zeros', bias_init=1.) + self.layernorm2 = LayerNorm(c) + self.output_projection = Linear(d_pair, d_pair, initializer='zeros', use_bias=False) + self.output_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True) + + self.p_drop = p_drop + + def forward(self, Z_raw): + Z = self.layernorm1(Z_raw) + left_proj_act = self.left_projection(Z) + right_proj_act = self.right_projection(Z) + + left_proj_act = left_proj_act * torch.sigmoid(self.left_gate(Z)) + right_proj_act = right_proj_act * torch.sigmoid(self.right_gate(Z)) + + g = torch.sigmoid(self.output_gate(Z)) + # p = torch.matmul( + # permute_final_dims(left_proj_act, (2, 0, 1)), + # permute_final_dims(right_proj_act, (2, 1, 0)), + # ) + # ab = permute_final_dims(p, (1, 2, 0)) + + ab = torch.einsum('bikd,bjkd->bijd', left_proj_act, right_proj_act) + ab = self.output_projection(self.layernorm2(ab)) + dropout_mask = torch.ones_like(Z[:, 0:1, :, :]).to(Z.device).to(Z.dtype) + return bias_ele_dropout_residual(ab, + self.output_bias, + g, + dropout_mask, + Z_raw, + prob=self.p_drop) + + +class 
TriangleMultiplicationIncoming(nn.Module): + + def __init__(self, d_pair, p_drop, c=128): + super(TriangleMultiplicationIncoming, self).__init__() + self.d_pair = d_pair + self.c = c + + self.layernorm1 = LayerNorm(d_pair) + self.left_projection = Linear(d_pair, c) + self.right_projection = Linear(d_pair, c) + self.left_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.) + self.right_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.) + + self.output_gate = Linear(d_pair, d_pair, initializer='zeros', bias_init=1.) + self.layernorm2 = LayerNorm(c) + self.output_projection = Linear(d_pair, d_pair, initializer='zeros', use_bias=False) + self.output_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True) + + self.p_drop = p_drop + + def forward(self, Z_raw): + Z = self.layernorm1(Z_raw) + left_proj_act = self.left_projection(Z) + right_proj_act = self.right_projection(Z) + + left_proj_act = left_proj_act * torch.sigmoid(self.left_gate(Z)) + right_proj_act = right_proj_act * torch.sigmoid(self.right_gate(Z)) + + g = torch.sigmoid(self.output_gate(Z)) + # p = torch.matmul( + # permute_final_dims(left_proj_act, (2, 1, 0)), + # permute_final_dims(right_proj_act, (2, 0, 1)), + # ) + # ab = permute_final_dims(p, (1, 2, 0)) + + ab = torch.einsum('bkid,bkjd->bijd', left_proj_act, right_proj_act) + ab = self.output_projection(self.layernorm2(ab)) + dropout_mask = torch.ones_like(Z[:, 0:1, :, :]).to(Z.device).to(Z.dtype) + return bias_ele_dropout_residual(ab, + self.output_bias, + g, + dropout_mask, + Z_raw, + prob=self.p_drop) + + +class TriangleAttentionStartingNode(nn.Module): + + def __init__(self, d_pair, p_drop, c=32, n_head=4): + super(TriangleAttentionStartingNode, self).__init__() + self.d_pair = d_pair + self.c = c + self.n_head = n_head + self.p_drop = p_drop + + self.layernorm1 = LayerNorm(d_pair) + _init_weights = torch.nn.init.normal_(torch.zeros([d_pair, n_head]), + std=1.0 / math.sqrt(d_pair)) + self.linear_b_weights = 
nn.parameter.Parameter(data=_init_weights) + self.attention = SelfAttention(qkv_dim=d_pair, + c=c, + n_head=n_head, + out_dim=d_pair, + gating=True, + last_bias_fuse=True) + + self.out_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True) + + def forward(self, Z_raw): + Z = self.layernorm1(Z_raw) + b = torch.einsum('bqkc,ch->bhqk', Z, self.linear_b_weights) + + Z = self.attention(Z, b) + + dropout_mask = torch.ones_like(Z[:, 0:1, :, :]).to(Z.device).to(Z.dtype) + return bias_dropout_add(Z, self.out_bias, dropout_mask, Z_raw, prob=self.p_drop) + + +class TriangleAttentionEndingNode(nn.Module): + + def __init__(self, d_pair, p_drop, c=32, n_head=4): + super(TriangleAttentionEndingNode, self).__init__() + self.d_pair = d_pair + self.c = c + self.n_head = n_head + self.p_drop = p_drop + + self.layernorm1 = LayerNorm(d_pair) + _init_weights = torch.nn.init.normal_(torch.zeros([d_pair, n_head]), + std=1.0 / math.sqrt(d_pair)) + self.linear_b_weights = nn.parameter.Parameter(data=_init_weights) + self.attention = SelfAttention(qkv_dim=d_pair, + c=c, + n_head=n_head, + out_dim=d_pair, + gating=True, + last_bias_fuse=True) + + self.out_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True) + + def forward(self, Z_raw): + Z = Z_raw.transpose(-2, -3) + Z = self.layernorm1(Z) + b = torch.einsum('bqkc,ch->bhqk', Z, self.linear_b_weights) + + Z = self.attention(Z, b) + + Z = Z.transpose(-2, -3) + dropout_mask = torch.ones_like(Z[:, :, 0:1, :]).to(Z.device).to(Z.dtype) + return bias_dropout_add(Z, self.out_bias, dropout_mask, Z_raw, prob=self.p_drop) + + +class PairStack(nn.Module): + + def __init__(self, d_pair, p_drop=0.25): + super(PairStack, self).__init__() + + self.TriangleMultiplicationOutgoing = TriangleMultiplicationOutgoing(d_pair, p_drop=p_drop) + self.TriangleMultiplicationIncoming = TriangleMultiplicationIncoming(d_pair, p_drop=p_drop) + self.TriangleAttentionStartingNode = TriangleAttentionStartingNode(d_pair, 
p_drop=p_drop) + self.TriangleAttentionEndingNode = TriangleAttentionEndingNode(d_pair, p_drop=p_drop) + self.PairTransition = Transition(d=d_pair) + + def forward(self, pair): + pair = self.TriangleMultiplicationOutgoing(pair) + pair = self.TriangleMultiplicationIncoming(pair) + pair = self.TriangleAttentionStartingNode(pair) + pair = self.TriangleAttentionEndingNode(pair) + pair = self.PairTransition(pair) + return pair From fff493c2021a55754d574cc1457cb4c695e30354 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Thu, 29 Dec 2022 11:48:11 +0800 Subject: [PATCH 065/209] init openfold --- evoformer_openfold/evoformer.py | 59 -- evoformer_openfold/initializer.py | 29 - evoformer_openfold/kernel.py | 19 - evoformer_openfold/msa.py | 95 --- evoformer_openfold/ops.py | 176 ----- evoformer_openfold/triangle.py | 192 ------ openfold/checkpointing.py | 84 +++ openfold/dropout.py | 78 +++ openfold/evoformer.py | 636 +++++++++++++++++++ openfold/msa.py | 392 ++++++++++++ openfold/outer_product_mean.py | 129 ++++ openfold/pair_transition.py | 99 +++ openfold/primitives.py | 529 +++++++++++++++ openfold/tensor_utils.py | 408 ++++++++++++ openfold/triangular_attention.py | 139 ++++ openfold/triangular_multiplicative_update.py | 127 ++++ 16 files changed, 2621 insertions(+), 570 deletions(-) delete mode 100644 evoformer_openfold/evoformer.py delete mode 100755 evoformer_openfold/initializer.py delete mode 100644 evoformer_openfold/kernel.py delete mode 100644 evoformer_openfold/msa.py delete mode 100755 evoformer_openfold/ops.py delete mode 100644 evoformer_openfold/triangle.py create mode 100644 openfold/checkpointing.py create mode 100644 openfold/dropout.py create mode 100644 openfold/evoformer.py create mode 100644 openfold/msa.py create mode 100644 openfold/outer_product_mean.py create mode 100644 openfold/pair_transition.py create mode 100644 openfold/primitives.py create mode 100644 openfold/tensor_utils.py create mode 100644 openfold/triangular_attention.py create mode 
100644 openfold/triangular_multiplicative_update.py diff --git a/evoformer_openfold/evoformer.py b/evoformer_openfold/evoformer.py deleted file mode 100644 index cfd2bb2a2529..000000000000 --- a/evoformer_openfold/evoformer.py +++ /dev/null @@ -1,59 +0,0 @@ -import torch -import torch.nn as nn - -from .msa import MSAStack -from .ops import OutProductMean -from .triangle import PairStack - - -def print_memory(init_mem, text=None): - now_mem = torch.cuda.memory_allocated() / 1024 ** 2 - init_mem - max_mem = torch.cuda.max_memory_allocated() / 1024 ** 2 - init_mem - print("%s now:%.2f max:%.2f" % ("" if text is None else text, now_mem, max_mem)) - torch.cuda.reset_peak_memory_stats() - - -class EvoformerBlock(nn.Module): - - def __init__(self, d_node, d_pair): - super(EvoformerBlock, self).__init__() - - self.msa_stack = MSAStack(d_node, d_pair, p_drop=0.15) - self.communication = OutProductMean(n_feat=d_node, n_feat_out=d_pair, n_feat_proj=32) - self.pair_stack = PairStack(d_pair=d_pair) - - def forward(self, node, pair): - node = self.msa_stack(node, pair) - pair = pair + self.communication(node) - pair = self.pair_stack(pair) - return node, pair - - -class Evoformer(nn.Module): - - def __init__(self, d_node, d_pair): - super(Evoformer, self).__init__() - - self.blocks = nn.ModuleList() - for _ in range(1): - self.blocks.append(EvoformerBlock(d_node, d_pair)) - - def forward(self, node, pair): - for b in self.blocks: - node, pair = b(node, pair) - return node, pair - - -def evoformer_tiny(): - return Evoformer(d_node=64, d_pair=32) - - -def evoformer_base(): - return Evoformer(d_node=256, d_pair=128) - - -def evoformer_large(): - return Evoformer(d_node=512, d_pair=256) - - -__all__ = ['Evoformer', 'evoformer_base', 'evoformer_large'] diff --git a/evoformer_openfold/initializer.py b/evoformer_openfold/initializer.py deleted file mode 100755 index c6ce0659e597..000000000000 --- a/evoformer_openfold/initializer.py +++ /dev/null @@ -1,29 +0,0 @@ -import math - -import 
numpy as np -import torch.nn as nn - - -def glorot_uniform_af(x, gain=1.0): - """ - initialize tensors the same as xavier_initializer in PyTorch, but the dimensions are different: - In PyTorch: - [feature_out, feature_in, n_head ...] - In Jax: - [... n_head, feature_in, feature_out] - However, there is a feature in original Alphafold2 code that they use the Jax version initializer to initialize tensors like: - [feature_in, n_head, feature_out] - - In this function, we keep this feature to initialize [feature_in, n_head, ..., feature_out] tensors - """ - fan_in, fan_out = x.shape[-2:] - if len(x.shape) > 2: - receptive_field_size = np.prod(x.shape[:-2]) - fan_in *= receptive_field_size - fan_out *= receptive_field_size - std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) - dev = math.sqrt(3.0) * std # Calculate uniform bounds from standard deviation - - nn.init.uniform_(x, -dev, dev) - - return x diff --git a/evoformer_openfold/kernel.py b/evoformer_openfold/kernel.py deleted file mode 100644 index 26ab5dc53261..000000000000 --- a/evoformer_openfold/kernel.py +++ /dev/null @@ -1,19 +0,0 @@ -import torch -import torch.nn.functional as F - - -def bias_sigmod_ele(y, bias, z): - return torch.sigmoid(y + bias) * z - - -def bias_dropout_add(x: torch.Tensor, bias: torch.Tensor, dropmask: torch.Tensor, - residual: torch.Tensor, prob: float) -> torch.Tensor: - out = (x + bias) * F.dropout(dropmask, p=prob, training=False) - out = residual + out - return out - - -def bias_ele_dropout_residual(ab: torch.Tensor, b: torch.Tensor, g: torch.Tensor, - dropout_mask: torch.Tensor, Z_raw: torch.Tensor, - prob: float) -> torch.Tensor: - return Z_raw + F.dropout(dropout_mask, p=prob, training=True) * (g * (ab + b)) \ No newline at end of file diff --git a/evoformer_openfold/msa.py b/evoformer_openfold/msa.py deleted file mode 100644 index cac456638a55..000000000000 --- a/evoformer_openfold/msa.py +++ /dev/null @@ -1,95 +0,0 @@ -import math - -import torch -import torch.nn as nn 
-import torch.nn.functional as F -from einops import rearrange -from torch.nn import LayerNorm - -from .kernel import bias_dropout_add -from .ops import SelfAttention, Transition - - -class MSARowAttentionWithPairBias(nn.Module): - - def __init__(self, d_node, d_pair, c=32, n_head=8, p_drop=0.15): - super(MSARowAttentionWithPairBias, self).__init__() - self.d_node = d_node - self.d_pair = d_pair - self.c = c - self.n_head = n_head - self.p_drop = p_drop - - self.layernormM = LayerNorm(d_node) - self.layernormZ = LayerNorm(d_pair) - - _init_weights = torch.nn.init.normal_(torch.zeros([n_head, d_pair]), - std=1.0 / math.sqrt(d_pair)) - self.linear_b_weights = nn.parameter.Parameter(data=_init_weights, requires_grad=True) - - self.attention = SelfAttention(qkv_dim=d_node, - c=c, - n_head=n_head, - out_dim=d_node, - gating=True, - last_bias_fuse=True) - - self.out_bias = nn.parameter.Parameter(data=torch.zeros((d_node,)), requires_grad=True) - - def forward(self, M_raw, Z): - ## Input projections - M = self.layernormM(M_raw) - Z = self.layernormZ(Z) - b = F.linear(Z, self.linear_b_weights) - b = b.permute(0, 3, 1, 2) - # b = rearrange(b, 'b q k h -> b h q k') - - M = self.attention(M, b) - dropout_mask = torch.ones_like(M[:, 0:1, :, :]).to(M.device).to(M.dtype) - - return bias_dropout_add(M, self.out_bias, dropout_mask, M_raw, prob=self.p_drop) - - -class MSAColumnAttention(nn.Module): - - def __init__(self, d_node, c=32, n_head=8): - super(MSAColumnAttention, self).__init__() - self.d_node = d_node - self.c = c - self.n_head = n_head - - self.layernormM = LayerNorm(d_node) - self.attention = SelfAttention(qkv_dim=d_node, - c=c, - n_head=n_head, - out_dim=d_node, - gating=True) - - def forward(self, M_raw): - M = M_raw.transpose(-2, -3) - M = self.layernormM(M) - - M = self.attention(M) - - M = M.transpose(-2, -3) - return M_raw + M - - -class MSAStack(nn.Module): - - def __init__(self, d_node, d_pair, p_drop=0.15): - super(MSAStack, self).__init__() - - 
self.MSARowAttentionWithPairBias = MSARowAttentionWithPairBias(d_node=d_node, - d_pair=d_pair, - p_drop=p_drop) - - self.MSAColumnAttention = MSAColumnAttention(d_node=d_node) - self.MSATransition = Transition(d=d_node) - - def forward(self, node, pair): - node = self.MSARowAttentionWithPairBias(node, pair) - node = self.MSAColumnAttention(node) - node = self.MSATransition(node) - - return node diff --git a/evoformer_openfold/ops.py b/evoformer_openfold/ops.py deleted file mode 100755 index 611b7b0fe777..000000000000 --- a/evoformer_openfold/ops.py +++ /dev/null @@ -1,176 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F -from einops import rearrange -from torch.nn import LayerNorm - -from .initializer import glorot_uniform_af -from .kernel import bias_sigmod_ele - - -class DropoutRowwise(nn.Module): - - def __init__(self, p): - super(DropoutRowwise, self).__init__() - self.p = p - self.dropout = nn.Dropout(p=p) - - def forward(self, x): - dropout_mask = torch.ones_like(x[:, 0:1, :, :]) - dropout_mask = self.dropout(dropout_mask) - return dropout_mask * x - - -class DropoutColumnwise(nn.Module): - - def __init__(self, p): - super(DropoutColumnwise, self).__init__() - self.p = p - self.dropout = nn.Dropout(p=p) - - def forward(self, x): - dropout_mask = torch.ones_like(x[:, :, 0:1, :]) - dropout_mask = self.dropout(dropout_mask) - return dropout_mask * x - - -class Transition(nn.Module): - - def __init__(self, d, n=4): - super(Transition, self).__init__() - self.norm = LayerNorm(d) - self.linear1 = Linear(d, n * d, initializer='relu') - self.linear2 = Linear(n * d, d, initializer='zeros') - - def forward(self, src): - x = self.norm(src) - x = self.linear2(F.relu(self.linear1(x))) - return src + x - - -class OutProductMean(nn.Module): - - def __init__(self, n_feat=64, n_feat_out=128, n_feat_proj=32): - super(OutProductMean, self).__init__() - - self.layernormM = LayerNorm(n_feat) - self.linear_a = Linear(n_feat, n_feat_proj) - 
self.linear_b = Linear(n_feat, n_feat_proj) - - self.o_linear = Linear(n_feat_proj * n_feat_proj, - n_feat_out, - initializer='zero', - use_bias=True) - - def forward(self, M): - M = self.layernormM(M) - left_act = self.linear_a(M) - right_act = self.linear_b(M) - - O = torch.einsum('bsid,bsje->bijde', left_act, right_act).contiguous() - # O = rearrange(O, 'b i j d e -> b i j (d e)') - O = O.reshape(O.shape[0], O.shape[1], O.shape[2], -1) - Z = self.o_linear(O) - - return Z - - -class Linear(nn.Linear): - """ - A Linear layer with built-in nonstandard initializations. Called just - like torch.nn.Linear. - Implements the initializers in 1.11.4, plus some additional ones found - in the code. - """ - - def __init__( - self, - feature_in: int, - feature_out: int, - initializer: str = 'linear', - use_bias: bool = True, - bias_init: float = 0., - ): - super(Linear, self).__init__(feature_in, feature_out, bias=use_bias) - - self.use_bias = use_bias - if initializer == 'linear': - glorot_uniform_af(self.weight, gain=1.0) - elif initializer == 'relu': - glorot_uniform_af(self.weight, gain=2.0) - elif initializer == 'zeros': - nn.init.zeros_(self.weight) - if self.use_bias: - with torch.no_grad(): - self.bias.fill_(bias_init) - - -class SelfAttention(nn.Module): - """ - Multi-Head SelfAttention dealing with [batch_size1, batch_size2, len, dim] tensors - """ - - def __init__(self, qkv_dim, c, n_head, out_dim, gating=True, last_bias_fuse=False): - super(SelfAttention, self).__init__() - self.qkv_dim = qkv_dim - self.c = c - self.n_head = n_head - self.out_dim = out_dim - self.gating = gating - self.last_bias_fuse = last_bias_fuse - - self.scaling = self.c**(-0.5) - - # self.to_qkv = Linear(qkv_dim, 3 * n_head * c, initializer='linear') - self.to_q = Linear(qkv_dim, n_head * c, initializer='linear', use_bias=False) - self.to_k = Linear(qkv_dim, n_head * c, initializer='linear', use_bias=False) - self.to_v = Linear(qkv_dim, n_head * c, initializer='linear', use_bias=False) - - 
if gating: - self.gating_bias = nn.parameter.Parameter(data=torch.ones((n_head * c,))) - self.gating_linear = Linear(qkv_dim, n_head * c, initializer='zero', use_bias=False) - - self.o_linear = Linear(n_head * c, - out_dim, - initializer='zero', - use_bias=(not last_bias_fuse)) - - def forward(self, in_data, nonbatched_bias=None): - """ - :param in_data: [batch_size1, batch_size2, len_qkv, qkv_dim] - :param bias: None or [batch_size1, batch_size2, n_head, len_q, len_kv] - :param nonbatched_bias: None or [batch_size1, n_head, len_q, len_kv] - """ - - # qkv = self.to_qkv(in_data).chunk(3, dim=-1) - # q, k, v = map(lambda t: rearrange(t, 'b1 b2 n (h d) -> b1 b2 h n d', h=self.n_head), qkv) - - q = self.to_q(in_data) - k = self.to_k(in_data) - v = self.to_v(in_data) - - # q, k, v = map(lambda t: rearrange(t, 'b1 b2 n (h d) -> b1 b2 h n d', h=self.n_head), - # [q, k, v]) - q, k, v = map(lambda t: t.view(t.shape[0], t.shape[1], t.shape[2], self.n_head, -1).permute(0, 1, 3, 2, 4), - [q, k, v]) - - q = q * self.scaling - - logits = torch.matmul(q, k.transpose(-1, -2)) - - if nonbatched_bias is not None: - logits += nonbatched_bias.unsqueeze(1) - weights = torch.softmax(logits, dim=-1) - # weights = softmax(logits) - - weighted_avg = torch.matmul(weights, v) - # weighted_avg = rearrange(weighted_avg, 'b1 b2 h n d -> b1 b2 n (h d)') - weighted_avg = weighted_avg.permute(0, 1, 3, 2, 4) - weighted_avg = weighted_avg.reshape(weighted_avg.shape[0], weighted_avg.shape[1], weighted_avg.shape[2], -1) - - if self.gating: - gate_values = self.gating_linear(in_data) - weighted_avg = bias_sigmod_ele(gate_values, self.gating_bias, weighted_avg) - - output = self.o_linear(weighted_avg) - return output diff --git a/evoformer_openfold/triangle.py b/evoformer_openfold/triangle.py deleted file mode 100644 index f479469c3836..000000000000 --- a/evoformer_openfold/triangle.py +++ /dev/null @@ -1,192 +0,0 @@ -import math - -import torch -import torch.nn as nn -from torch.nn import LayerNorm - 
-from .kernel import bias_dropout_add, bias_ele_dropout_residual -from .ops import Linear, SelfAttention, Transition - - -def permute_final_dims(tensor, inds): - zero_index = -1 * len(inds) - first_inds = list(range(len(tensor.shape[:zero_index]))) - return tensor.permute(first_inds + [zero_index + i for i in inds]) - - -class TriangleMultiplicationOutgoing(nn.Module): - - def __init__(self, d_pair, p_drop, c=128): - super(TriangleMultiplicationOutgoing, self).__init__() - self.d_pair = d_pair - self.c = c - - self.layernorm1 = LayerNorm(d_pair) - self.left_projection = Linear(d_pair, c) - self.right_projection = Linear(d_pair, c) - self.left_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.) - self.right_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.) - - self.output_gate = Linear(d_pair, d_pair, initializer='zeros', bias_init=1.) - self.layernorm2 = LayerNorm(c) - self.output_projection = Linear(d_pair, d_pair, initializer='zeros', use_bias=False) - self.output_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True) - - self.p_drop = p_drop - - def forward(self, Z_raw): - Z = self.layernorm1(Z_raw) - left_proj_act = self.left_projection(Z) - right_proj_act = self.right_projection(Z) - - left_proj_act = left_proj_act * torch.sigmoid(self.left_gate(Z)) - right_proj_act = right_proj_act * torch.sigmoid(self.right_gate(Z)) - - g = torch.sigmoid(self.output_gate(Z)) - # p = torch.matmul( - # permute_final_dims(left_proj_act, (2, 0, 1)), - # permute_final_dims(right_proj_act, (2, 1, 0)), - # ) - # ab = permute_final_dims(p, (1, 2, 0)) - - ab = torch.einsum('bikd,bjkd->bijd', left_proj_act, right_proj_act) - ab = self.output_projection(self.layernorm2(ab)) - dropout_mask = torch.ones_like(Z[:, 0:1, :, :]).to(Z.device).to(Z.dtype) - return bias_ele_dropout_residual(ab, - self.output_bias, - g, - dropout_mask, - Z_raw, - prob=self.p_drop) - - -class TriangleMultiplicationIncoming(nn.Module): - - def __init__(self, d_pair, 
p_drop, c=128): - super(TriangleMultiplicationIncoming, self).__init__() - self.d_pair = d_pair - self.c = c - - self.layernorm1 = LayerNorm(d_pair) - self.left_projection = Linear(d_pair, c) - self.right_projection = Linear(d_pair, c) - self.left_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.) - self.right_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.) - - self.output_gate = Linear(d_pair, d_pair, initializer='zeros', bias_init=1.) - self.layernorm2 = LayerNorm(c) - self.output_projection = Linear(d_pair, d_pair, initializer='zeros', use_bias=False) - self.output_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True) - - self.p_drop = p_drop - - def forward(self, Z_raw): - Z = self.layernorm1(Z_raw) - left_proj_act = self.left_projection(Z) - right_proj_act = self.right_projection(Z) - - left_proj_act = left_proj_act * torch.sigmoid(self.left_gate(Z)) - right_proj_act = right_proj_act * torch.sigmoid(self.right_gate(Z)) - - g = torch.sigmoid(self.output_gate(Z)) - # p = torch.matmul( - # permute_final_dims(left_proj_act, (2, 1, 0)), - # permute_final_dims(right_proj_act, (2, 0, 1)), - # ) - # ab = permute_final_dims(p, (1, 2, 0)) - - ab = torch.einsum('bkid,bkjd->bijd', left_proj_act, right_proj_act) - ab = self.output_projection(self.layernorm2(ab)) - dropout_mask = torch.ones_like(Z[:, 0:1, :, :]).to(Z.device).to(Z.dtype) - return bias_ele_dropout_residual(ab, - self.output_bias, - g, - dropout_mask, - Z_raw, - prob=self.p_drop) - - -class TriangleAttentionStartingNode(nn.Module): - - def __init__(self, d_pair, p_drop, c=32, n_head=4): - super(TriangleAttentionStartingNode, self).__init__() - self.d_pair = d_pair - self.c = c - self.n_head = n_head - self.p_drop = p_drop - - self.layernorm1 = LayerNorm(d_pair) - _init_weights = torch.nn.init.normal_(torch.zeros([d_pair, n_head]), - std=1.0 / math.sqrt(d_pair)) - self.linear_b_weights = nn.parameter.Parameter(data=_init_weights) - self.attention = 
SelfAttention(qkv_dim=d_pair, - c=c, - n_head=n_head, - out_dim=d_pair, - gating=True, - last_bias_fuse=True) - - self.out_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True) - - def forward(self, Z_raw): - Z = self.layernorm1(Z_raw) - b = torch.einsum('bqkc,ch->bhqk', Z, self.linear_b_weights) - - Z = self.attention(Z, b) - - dropout_mask = torch.ones_like(Z[:, 0:1, :, :]).to(Z.device).to(Z.dtype) - return bias_dropout_add(Z, self.out_bias, dropout_mask, Z_raw, prob=self.p_drop) - - -class TriangleAttentionEndingNode(nn.Module): - - def __init__(self, d_pair, p_drop, c=32, n_head=4): - super(TriangleAttentionEndingNode, self).__init__() - self.d_pair = d_pair - self.c = c - self.n_head = n_head - self.p_drop = p_drop - - self.layernorm1 = LayerNorm(d_pair) - _init_weights = torch.nn.init.normal_(torch.zeros([d_pair, n_head]), - std=1.0 / math.sqrt(d_pair)) - self.linear_b_weights = nn.parameter.Parameter(data=_init_weights) - self.attention = SelfAttention(qkv_dim=d_pair, - c=c, - n_head=n_head, - out_dim=d_pair, - gating=True, - last_bias_fuse=True) - - self.out_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True) - - def forward(self, Z_raw): - Z = Z_raw.transpose(-2, -3) - Z = self.layernorm1(Z) - b = torch.einsum('bqkc,ch->bhqk', Z, self.linear_b_weights) - - Z = self.attention(Z, b) - - Z = Z.transpose(-2, -3) - dropout_mask = torch.ones_like(Z[:, :, 0:1, :]).to(Z.device).to(Z.dtype) - return bias_dropout_add(Z, self.out_bias, dropout_mask, Z_raw, prob=self.p_drop) - - -class PairStack(nn.Module): - - def __init__(self, d_pair, p_drop=0.25): - super(PairStack, self).__init__() - - self.TriangleMultiplicationOutgoing = TriangleMultiplicationOutgoing(d_pair, p_drop=p_drop) - self.TriangleMultiplicationIncoming = TriangleMultiplicationIncoming(d_pair, p_drop=p_drop) - self.TriangleAttentionStartingNode = TriangleAttentionStartingNode(d_pair, p_drop=p_drop) - self.TriangleAttentionEndingNode = 
TriangleAttentionEndingNode(d_pair, p_drop=p_drop) - self.PairTransition = Transition(d=d_pair) - - def forward(self, pair): - pair = self.TriangleMultiplicationOutgoing(pair) - pair = self.TriangleMultiplicationIncoming(pair) - pair = self.TriangleAttentionStartingNode(pair) - pair = self.TriangleAttentionEndingNode(pair) - pair = self.PairTransition(pair) - return pair diff --git a/openfold/checkpointing.py b/openfold/checkpointing.py new file mode 100644 index 000000000000..83e77c638ec1 --- /dev/null +++ b/openfold/checkpointing.py @@ -0,0 +1,84 @@ +# Copyright 2021 AlQuraishi Laboratory +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.utils.checkpoint +from typing import Any, Tuple, List, Callable, Optional + + +BLOCK_ARG = Any +BLOCK_ARGS = List[BLOCK_ARG] + + +def get_checkpoint_fn(): + checkpoint = torch.utils.checkpoint.checkpoint + + return checkpoint + + +@torch.jit.ignore +def checkpoint_blocks( + blocks: List[Callable], + args: BLOCK_ARGS, + blocks_per_ckpt: Optional[int], +) -> BLOCK_ARGS: + """ + Chunk a list of blocks and run each chunk with activation + checkpointing. We define a "block" as a callable whose only inputs are + the outputs of the previous block. + + Implements Subsection 1.11.8 + + Args: + blocks: + List of blocks + args: + Tuple of arguments for the first block. + blocks_per_ckpt: + Size of each chunk. A higher value corresponds to fewer + checkpoints, and trades memory for speed. 
If None, no checkpointing + is performed. + Returns: + The output of the final block + """ + def wrap(a): + return (a,) if type(a) is not tuple else a + + def exec(b, a): + for block in b: + a = wrap(block(*a)) + return a + + def chunker(s, e): + def exec_sliced(*a): + return exec(blocks[s:e], a) + + return exec_sliced + + # Avoids mishaps when the blocks take just one argument + args = wrap(args) + + if blocks_per_ckpt is None: + return exec(blocks, args) + elif blocks_per_ckpt < 1 or blocks_per_ckpt > len(blocks): + raise ValueError("blocks_per_ckpt must be between 1 and len(blocks)") + + checkpoint = get_checkpoint_fn() + + for s in range(0, len(blocks), blocks_per_ckpt): + e = s + blocks_per_ckpt + args = checkpoint(chunker(s, e), *args) + args = wrap(args) + + return args diff --git a/openfold/dropout.py b/openfold/dropout.py new file mode 100644 index 000000000000..651b9775ef44 --- /dev/null +++ b/openfold/dropout.py @@ -0,0 +1,78 @@ +# Copyright 2021 AlQuraishi Laboratory +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import torch +import torch.nn as nn +from functools import partialmethod +from typing import Union, List + + +class Dropout(nn.Module): + """ + Implementation of dropout with the ability to share the dropout mask + along a particular dimension. + + If not in training mode, this module computes the identity function. 
+ """ + + def __init__(self, r: float, batch_dim: Union[int, List[int]]): + """ + Args: + r: + Dropout rate + batch_dim: + Dimension(s) along which the dropout mask is shared + """ + super(Dropout, self).__init__() + + self.r = r + if type(batch_dim) == int: + batch_dim = [batch_dim] + self.batch_dim = batch_dim + self.dropout = nn.Dropout(self.r) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Args: + x: + Tensor to which dropout is applied. Can have any shape + compatible with self.batch_dim + """ + shape = list(x.shape) + if self.batch_dim is not None: + for bd in self.batch_dim: + shape[bd] = 1 + mask = x.new_ones(shape) + mask = self.dropout(mask) + x *= mask + return x + + +class DropoutRowwise(Dropout): + """ + Convenience class for rowwise dropout as described in subsection + 1.11.6. + """ + + __init__ = partialmethod(Dropout.__init__, batch_dim=-3) + + +class DropoutColumnwise(Dropout): + """ + Convenience class for columnwise dropout as described in subsection + 1.11.6. + """ + + __init__ = partialmethod(Dropout.__init__, batch_dim=-2) diff --git a/openfold/evoformer.py b/openfold/evoformer.py new file mode 100644 index 000000000000..21e422b04764 --- /dev/null +++ b/openfold/evoformer.py @@ -0,0 +1,636 @@ +# Copyright 2021 AlQuraishi Laboratory +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class MSATransition(nn.Module):
    """
    Feed-forward network applied to MSA activations after attention.

    Implements Algorithm 9
    """

    def __init__(self, c_m, n):
        """
        Args:
            c_m:
                MSA channel dimension
            n:
                Factor multiplied to c_m to obtain the hidden channel
                dimension
        """
        super(MSATransition, self).__init__()

        self.c_m = c_m
        self.n = n

        self.layer_norm = LayerNorm(self.c_m)
        self.linear_1 = Linear(self.c_m, self.n * self.c_m, init="relu")
        self.relu = nn.ReLU()
        self.linear_2 = Linear(self.n * self.c_m, self.c_m, init="final")

    def _transition(self, m, mask):
        # Two-layer MLP; the mask zeroes padded positions on the way out.
        hidden = self.relu(self.linear_1(m))
        return self.linear_2(hidden) * mask

    @torch.jit.ignore
    def _chunk(self,
        m: torch.Tensor,
        mask: torch.Tensor,
        chunk_size: int,
    ) -> torch.Tensor:
        # Bound peak memory by running _transition over batch-dim chunks.
        return chunk_layer(
            self._transition,
            {"m": m, "mask": mask},
            chunk_size=chunk_size,
            no_batch_dims=len(m.shape[:-2]),
        )

    def forward(
        self,
        m: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
        chunk_size: Optional[int] = None,
    ) -> torch.Tensor:
        """
        Args:
            m:
                [*, N_seq, N_res, C_m] MSA activation
            mask:
                [*, N_seq, N_res, C_m] MSA mask
            chunk_size:
                Optional chunk size for memory-bounded execution
        Returns:
            m:
                [*, N_seq, N_res, C_m] MSA activation update
        """
        # DISCREPANCY: DeepMind forgets to apply the MSA mask here.
        mask = m.new_ones(m.shape[:-1]) if mask is None else mask

        # [*, N_seq, N_res, 1]
        mask = mask.unsqueeze(-1)

        m = self.layer_norm(m)

        if chunk_size is None:
            return self._transition(m, mask)
        return self._chunk(m, mask, chunk_size)
class EvoformerBlock(nn.Module):
    """
    One block of the Evoformer trunk: row- and column-wise MSA attention
    followed by the shared EvoformerBlockCore (MSA transition, outer
    product mean, and triangular pair updates).
    """

    def __init__(self,
        c_m: int,
        c_z: int,
        c_hidden_msa_att: int,
        c_hidden_opm: int,
        c_hidden_mul: int,
        c_hidden_pair_att: int,
        no_heads_msa: int,
        no_heads_pair: int,
        transition_n: int,
        msa_dropout: float,
        pair_dropout: float,
        inf: float,
        eps: float,
        is_multimer: bool,
    ):
        """
        Args:
            c_m: MSA channel dimension
            c_z: Pair channel dimension
            c_hidden_msa_att: Hidden dimension in MSA attention
            c_hidden_opm: Hidden dimension in outer product mean module
            c_hidden_mul: Hidden dimension in multiplicative updates
            c_hidden_pair_att: Hidden dimension in triangular attention
            no_heads_msa: Number of heads used for MSA attention
            no_heads_pair: Number of heads used for pair attention
            transition_n: Transition hidden-dimension factor
            msa_dropout: Dropout rate for MSA activations
            pair_dropout: Dropout rate for pair activations
            inf: Large number used in computing attention masks
            eps: Small constant forwarded to the core
            is_multimer: Multimer-mode flag (stored; not read in forward)
        """
        super(EvoformerBlock, self).__init__()

        # Row-wise MSA self-attention, biased by the pair embedding.
        self.msa_att_row = MSARowAttentionWithPairBias(
            c_m=c_m,
            c_z=c_z,
            c_hidden=c_hidden_msa_att,
            no_heads=no_heads_msa,
            inf=inf,
        )

        # Column-wise MSA self-attention (no pair bias).
        self.msa_att_col = MSAColumnAttention(
            c_m,
            c_hidden_msa_att,
            no_heads_msa,
            inf=inf,
        )

        self.msa_dropout_layer = DropoutRowwise(msa_dropout)

        # Shared tail of the block: MSA transition, outer product mean,
        # and all pair-stack updates.
        self.core = EvoformerBlockCore(
            c_m=c_m,
            c_z=c_z,
            c_hidden_opm=c_hidden_opm,
            c_hidden_mul=c_hidden_mul,
            c_hidden_pair_att=c_hidden_pair_att,
            no_heads_msa=no_heads_msa,
            no_heads_pair=no_heads_pair,
            transition_n=transition_n,
            pair_dropout=pair_dropout,
            inf=inf,
            eps=eps,
        )

        # NOTE(review): this OuterProductMean is never used in forward()
        # below — the core constructs its own. Looks like dead parameters;
        # confirm (removing it would change state_dict keys).
        self.outer_product_mean = OuterProductMean(
            c_m,
            c_z,
            c_hidden_opm,
        )
        self.is_multimer = is_multimer

    def forward(self,
        m: torch.Tensor,
        z: torch.Tensor,
        msa_mask: torch.Tensor,
        pair_mask: torch.Tensor,
        chunk_size: Optional[int] = None,
        _mask_trans: bool = True,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
            m: [*, N_seq, N_res, C_m] MSA embedding
            z: [*, N_res, N_res, C_z] pair embedding
            msa_mask: [*, N_seq, N_res] MSA mask
            pair_mask: [*, N_res, N_res] pair mask
            chunk_size: Optional chunk size for memory-bounded attention
            _mask_trans: Whether to mask the transition modules in the core
        Returns:
            Updated (m, z) pair.
        """
        # Residual row attention with row-shared dropout.
        m = m + self.msa_dropout_layer(
            self.msa_att_row(m, z=z, mask=msa_mask, chunk_size=chunk_size)
        )
        # Residual column attention (no dropout on this path).
        m = m + self.msa_att_col(m, mask=msa_mask, chunk_size=chunk_size)
        # Remaining updates (transitions, OPM, triangular ops) happen in
        # the shared core.
        m, z = self.core(
            m,
            z,
            msa_mask=msa_mask,
            pair_mask=pair_mask,
            chunk_size=chunk_size,
            _mask_trans=_mask_trans,
        )

        return m, z
class EvoformerStack(nn.Module):
    """
    Main Evoformer trunk.

    Implements Algorithm 6.
    """

    def __init__(
        self,
        c_m: int,
        c_z: int,
        c_hidden_msa_att: int,
        c_hidden_opm: int,
        c_hidden_mul: int,
        c_hidden_pair_att: int,
        c_s: int,
        no_heads_msa: int,
        no_heads_pair: int,
        no_blocks: int,
        transition_n: int,
        msa_dropout: float,
        pair_dropout: float,
        blocks_per_ckpt: int,
        inf: float,
        eps: float,
        clear_cache_between_blocks: bool = False,
        is_multimer: bool = False,
        **kwargs,
    ):
        """
        Args:
            c_m:
                MSA channel dimension
            c_z:
                Pair channel dimension
            c_hidden_msa_att:
                Hidden dimension in MSA attention
            c_hidden_opm:
                Hidden dimension in outer product mean module
            c_hidden_mul:
                Hidden dimension in multiplicative updates
            c_hidden_pair_att:
                Hidden dimension in triangular attention
            c_s:
                Channel dimension of the output "single" embedding
            no_heads_msa:
                Number of heads used for MSA attention
            no_heads_pair:
                Number of heads used for pair attention
            no_blocks:
                Number of Evoformer blocks in the stack
            transition_n:
                Factor by which to multiply c_m to obtain the MSATransition
                hidden dimension
            msa_dropout:
                Dropout rate for MSA activations
            pair_dropout:
                Dropout used for pair activations
            blocks_per_ckpt:
                Number of Evoformer blocks in each activation checkpoint
            inf:
                Large number used in computing attention masks
            eps:
                Small constant forwarded to each block
            clear_cache_between_blocks:
                Whether to clear CUDA's GPU memory cache between blocks of the
                stack. Slows down each block but can reduce fragmentation
            is_multimer:
                Multimer-mode flag forwarded to each block
        """
        super(EvoformerStack, self).__init__()

        self.blocks_per_ckpt = blocks_per_ckpt
        self.clear_cache_between_blocks = clear_cache_between_blocks

        self.blocks = nn.ModuleList()

        # Identical hyperparameters per block; weights are independent.
        for _ in range(no_blocks):
            block = EvoformerBlock(
                c_m=c_m,
                c_z=c_z,
                c_hidden_msa_att=c_hidden_msa_att,
                c_hidden_opm=c_hidden_opm,
                c_hidden_mul=c_hidden_mul,
                c_hidden_pair_att=c_hidden_pair_att,
                no_heads_msa=no_heads_msa,
                no_heads_pair=no_heads_pair,
                transition_n=transition_n,
                msa_dropout=msa_dropout,
                pair_dropout=pair_dropout,
                inf=inf,
                eps=eps,
                is_multimer=is_multimer,
            )
            self.blocks.append(block)

        # Projects the final MSA embedding to the "single" embedding.
        self.linear = Linear(c_m, c_s)

    def forward(self,
        m: torch.Tensor,
        z: torch.Tensor,
        msa_mask: torch.Tensor,
        pair_mask: torch.Tensor,
        chunk_size: int,
        _mask_trans: bool = True,
    ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
        """
        Args:
            m:
                [*, N_seq, N_res, C_m] MSA embedding
            z:
                [*, N_res, N_res, C_z] pair embedding
            msa_mask:
                [*, N_seq, N_res] MSA mask
            pair_mask:
                [*, N_res, N_res] pair mask
            chunk_size:
                Chunk size for memory-bounded attention/transitions
            _mask_trans:
                Whether to mask the transition modules
        Returns:
            m:
                [*, N_seq, N_res, C_m] MSA embedding
            z:
                [*, N_res, N_res, C_z] pair embedding
            s:
                [*, N_res, C_s] single embedding (or None if extra MSA stack)
        """
        # Bind everything except (m, z) so checkpoint_blocks only has to
        # thread the two checkpointed tensors through the stack.
        blocks = [
            partial(
                b,
                msa_mask=msa_mask,
                pair_mask=pair_mask,
                chunk_size=chunk_size,
                _mask_trans=_mask_trans,
            )
            for b in self.blocks
        ]

        if(self.clear_cache_between_blocks):
            # Wrap each block so the CUDA allocator cache is dropped right
            # before the block runs (trades speed for less fragmentation).
            def block_with_cache_clear(block, *args):
                torch.cuda.empty_cache()
                return block(*args)

            blocks = [partial(block_with_cache_clear, b) for b in blocks]

        # blocks_per_ckpt=None during eval disables activation
        # checkpointing entirely (checkpoint_blocks runs the blocks plain).
        m, z = checkpoint_blocks(
            blocks,
            args=(m, z),
            blocks_per_ckpt=self.blocks_per_ckpt if self.training else None,
        )

        # Single embedding from the first MSA row (presumably the target
        # sequence row — confirm against the embedder).
        s = self.linear(m[..., 0, :, :])

        return m, z, s
+ """ + + def __init__(self, + c_m: int, + c_z: int, + c_hidden_msa_att: int, + c_hidden_opm: int, + c_hidden_mul: int, + c_hidden_pair_att: int, + no_heads_msa: int, + no_heads_pair: int, + no_blocks: int, + transition_n: int, + msa_dropout: float, + pair_dropout: float, + inf: float, + eps: float, + ckpt: bool, + clear_cache_between_blocks: bool = False, + is_multimer: bool = False, + **kwargs, + ): + super(ExtraMSAStack, self).__init__() + + self.clear_cache_between_blocks = clear_cache_between_blocks + self.blocks = nn.ModuleList() + for _ in range(no_blocks): + block = ExtraMSABlock( + c_m=c_m, + c_z=c_z, + c_hidden_msa_att=c_hidden_msa_att, + c_hidden_opm=c_hidden_opm, + c_hidden_mul=c_hidden_mul, + c_hidden_pair_att=c_hidden_pair_att, + no_heads_msa=no_heads_msa, + no_heads_pair=no_heads_pair, + transition_n=transition_n, + msa_dropout=msa_dropout, + pair_dropout=pair_dropout, + inf=inf, + eps=eps, + ckpt=ckpt, + is_multimer=is_multimer, + ) + self.blocks.append(block) + + def forward(self, + m: torch.Tensor, + z: torch.Tensor, + chunk_size: int, + msa_mask: Optional[torch.Tensor] = None, + pair_mask: Optional[torch.Tensor] = None, + _mask_trans: bool = True, + ) -> torch.Tensor: + """ + Args: + m: + [*, N_extra, N_res, C_m] extra MSA embedding + z: + [*, N_res, N_res, C_z] pair embedding + msa_mask: + Optional [*, N_extra, N_res] MSA mask + pair_mask: + Optional [*, N_res, N_res] pair mask + Returns: + [*, N_res, N_res, C_z] pair update + """ + #checkpoint_fn = get_checkpoint_fn() + #blocks = [ + # partial(b, msa_mask=msa_mask, pair_mask=pair_mask, chunk_size=chunk_size, _chunk_logits=None) for b in self.blocks + #] + + #def dodo(b, *args): + # torch.cuda.empty_cache() + # return b(*args) + + #blocks = [partial(dodo, b) for b in blocks] + + #for b in blocks: + # if(torch.is_grad_enabled()): + # m, z = checkpoint_fn(b, *(m, z)) + # else: + # m, z = b(m, z) + + for b in self.blocks: + m, z = b(m, z, msa_mask, pair_mask, chunk_size=chunk_size) + + 
if(self.clear_cache_between_blocks): + torch.cuda.empty_cache() + + return z \ No newline at end of file diff --git a/openfold/msa.py b/openfold/msa.py new file mode 100644 index 000000000000..172b26def5f1 --- /dev/null +++ b/openfold/msa.py @@ -0,0 +1,392 @@ +# Copyright 2021 AlQuraishi Laboratory +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import torch +import torch.nn as nn +from typing import Optional, List, Tuple + +from openfold.primitives import ( + Linear, + LayerNorm, + Attention, + GlobalAttention, + _attention_chunked_trainable, +) +from openfold.checkpointing import get_checkpoint_fn +from openfold.tensor_utils import ( + chunk_layer, + permute_final_dims, + flatten_final_dims, +) + + +class MSAAttention(nn.Module): + def __init__( + self, + c_in, + c_hidden, + no_heads, + pair_bias=False, + c_z=None, + inf=1e9, + ): + """ + Args: + c_in: + Input channel dimension + c_hidden: + Per-head hidden channel dimension + no_heads: + Number of attention heads + pair_bias: + Whether to use pair embedding bias + c_z: + Pair embedding channel dimension. 
Ignored unless pair_bias + is true + inf: + A large number to be used in computing the attention mask + """ + super(MSAAttention, self).__init__() + + self.c_in = c_in + self.c_hidden = c_hidden + self.no_heads = no_heads + self.pair_bias = pair_bias + self.c_z = c_z + self.inf = inf + + self.layer_norm_m = LayerNorm(self.c_in) + + self.layer_norm_z = None + self.linear_z = None + if self.pair_bias: + self.layer_norm_z = LayerNorm(self.c_z) + self.linear_z = Linear( + self.c_z, self.no_heads, bias=False, init="normal" + ) + + self.mha = Attention( + self.c_in, self.c_in, self.c_in, self.c_hidden, self.no_heads + ) + + @torch.jit.ignore + def _chunk(self, + m: torch.Tensor, + biases: List[torch.Tensor], + chunk_size: int, + ) -> torch.Tensor: + return chunk_layer( + self.mha, + {"q_x": m, "kv_x": m, "biases": biases}, + chunk_size=chunk_size, + no_batch_dims=len(m.shape[:-2]), + ) + + def _prep_inputs(self, + m: torch.Tensor, + z: Optional[torch.Tensor], + mask: Optional[torch.Tensor] + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + # [*, N_seq, N_res, C_m] + m = self.layer_norm_m(m) + + n_seq, n_res = m.shape[-3:-1] + if mask is None: + # [*, N_seq, N_res] + mask = m.new_ones( + m.shape[:-3] + (n_seq, n_res), + ) + + # [*, N_seq, 1, 1, N_res] + mask_bias = (self.inf * (mask - 1))[..., :, None, None, :] + + # This step simply returns a larger view of the bias, and does not + # consume additional memory. 
+ # [*, N_seq, no_heads, N_res, N_res] + #bias = bias.expand( + # ((-1,) * len(bias.shape[:-4])) + (-1, self.no_heads, n_res, -1) + #) + + if (self.pair_bias and + z is not None and # For the + self.layer_norm_z is not None and # benefit of + self.linear_z is not None # TorchScript + ): + # [*, N_res, N_res, C_z] + z = self.layer_norm_z(z) + + # [*, N_res, N_res, no_heads] + z = self.linear_z(z) + + # [*, 1, no_heads, N_res, N_res] + z = permute_final_dims(z, (2, 0, 1)).unsqueeze(-4) + + return m, mask_bias, z + + @torch.jit.ignore + def _chunked_msa_attn(self, + m: torch.Tensor, + z: Optional[torch.Tensor], + mask: Optional[torch.Tensor], + chunk_logits: int, + checkpoint: bool, + ) -> torch.Tensor: + MSA_DIM = -4 + + def _get_qkv(m, z): + m, mask_bias, z = self._prep_inputs(m, z, mask) + q, k, v = self.mha._prep_qkv(m, m) + return m, q, k, v, mask_bias, z + + checkpoint_fn = get_checkpoint_fn() + + if(torch.is_grad_enabled() and checkpoint): + m, q, k, v, mask_bias, z = checkpoint_fn(_get_qkv, m, z) + else: + m, q, k, v, mask_bias, z = _get_qkv(m, z) + + o = _attention_chunked_trainable( + query=q, + key=k, + value=v, + biases=[mask_bias, z], + chunk_size=chunk_logits, + chunk_dim=MSA_DIM, + checkpoint=checkpoint, + ) + + if(torch.is_grad_enabled() and checkpoint): + # Storing an additional m here is far from ideal + m = checkpoint_fn(self.mha._wrap_up, o, m) + else: + m = self.mha._wrap_up(o, m) + + return m + + def forward(self, + m: torch.Tensor, + z: Optional[torch.Tensor] = None, + mask: Optional[torch.Tensor] = None, + chunk_size: Optional[int] = None, + _chunk_logits: Optional[int] = None, + _checkpoint_chunks: Optional[bool] = None, + ) -> torch.Tensor: + """ + Args: + m: + [*, N_seq, N_res, C_m] MSA embedding + z: + [*, N_res, N_res, C_z] pair embedding. Required only if + pair_bias is True + mask: + [*, N_seq, N_res] MSA mask + chunk_size: + Size of chunks into which the inputs are split along their + batch dimensions. 
class MSAColumnAttention(nn.Module):
    """
    Implements Algorithm 8.

    Column-wise MSA self-attention, realized by transposing the MSA and
    delegating to a plain MSAAttention. By rights, this should also be a
    subclass of MSAAttention; alas, most inheritance isn't supported by
    TorchScript.
    """

    def __init__(self, c_m, c_hidden, no_heads, inf=1e9):
        """
        Args:
            c_m:
                MSA channel dimension
            c_hidden:
                Per-head hidden channel dimension
            no_heads:
                Number of attention heads
            inf:
                Large number used to construct attention masks
        """
        super(MSAColumnAttention, self).__init__()

        self.c_m = c_m
        self.c_hidden = c_hidden
        self.no_heads = no_heads
        self.inf = inf

        # Delegate that performs ordinary (row) attention on the
        # transposed input; no pair bias for column attention.
        self._msa_att = MSAAttention(
            c_in=c_m,
            c_hidden=c_hidden,
            no_heads=no_heads,
            pair_bias=False,
            c_z=None,
            inf=inf,
        )

    def forward(self,
        m: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
        chunk_size: Optional[int] = None
    ) -> torch.Tensor:
        """
        Args:
            m:
                [*, N_seq, N_res, C_m] MSA embedding
            mask:
                [*, N_seq, N_res] MSA mask
            chunk_size:
                Size of chunks into which the inputs are split along their
                batch dimensions. A low value decreases memory overhead at
                the cost of slower execution. Chunking is not performed by
                default.
        """
        # [*, N_res, N_seq, C_in] — swapping sequence and residue dims
        # turns column attention into row attention on the transpose.
        m_t = m.transpose(-2, -3)
        mask_t = None if mask is None else mask.transpose(-1, -2)

        m_t = self._msa_att(m_t, mask=mask_t, chunk_size=chunk_size)

        # [*, N_seq, N_res, C_in] — restore the original layout.
        return m_t.transpose(-2, -3)
class OuterProductMean(nn.Module):
    """
    Implements Algorithm 10.
    """

    def __init__(self, c_m, c_z, c_hidden, eps=1e-3):
        """
        Args:
            c_m:
                MSA embedding channel dimension
            c_z:
                Pair embedding channel dimension
            c_hidden:
                Hidden channel dimension
            eps:
                Small constant added to the mask normalization to avoid
                division by zero
        """
        super(OuterProductMean, self).__init__()

        self.c_m = c_m
        self.c_z = c_z
        self.c_hidden = c_hidden
        self.eps = eps

        self.layer_norm = nn.LayerNorm(c_m)
        self.linear_1 = Linear(c_m, c_hidden)
        self.linear_2 = Linear(c_m, c_hidden)
        self.linear_out = Linear(c_hidden ** 2, c_z, init="final")

    def _opm(self, a, b):
        # Outer product over the hidden channels of the two projections,
        # summed over the sequence dimension (index "a" is contracted).
        # [*, N_res, N_res, C, C]
        outer = torch.einsum("...bac,...dae->...bdce", a, b)

        # [*, N_res, N_res, C * C]
        outer = outer.reshape(outer.shape[:-2] + (-1,))

        # [*, N_res, N_res, C_z]
        outer = self.linear_out(outer)

        return outer

    @torch.jit.ignore
    def _chunk(self,
        a: torch.Tensor,
        b: torch.Tensor,
        chunk_size: int
    ) -> torch.Tensor:
        # Since the "batch dim" in this case is not a true batch dimension
        # (in that the shape of the output depends on it), we need to
        # iterate over it ourselves
        a_reshape = a.reshape((-1,) + a.shape[-3:])
        b_reshape = b.reshape((-1,) + b.shape[-3:])
        out = []
        for a_prime, b_prime in zip(a_reshape, b_reshape):
            # Chunk only over a's residue dim; b is closed over whole.
            outer = chunk_layer(
                partial(self._opm, b=b_prime),
                {"a": a_prime},
                chunk_size=chunk_size,
                no_batch_dims=1,
            )
            out.append(outer)
        outer = torch.stack(out, dim=0)
        # Restore the original leading batch dims.
        outer = outer.reshape(a.shape[:-3] + outer.shape[1:])

        return outer

    def forward(self,
        m: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
        chunk_size: Optional[int] = None
    ) -> torch.Tensor:
        """
        Args:
            m:
                [*, N_seq, N_res, C_m] MSA embedding
            mask:
                [*, N_seq, N_res] MSA mask
            chunk_size:
                Optional chunk size for memory-bounded execution
        Returns:
            [*, N_res, N_res, C_z] pair embedding update
        """
        if mask is None:
            mask = m.new_ones(m.shape[:-1])

        # [*, N_seq, N_res, C_m]
        m = self.layer_norm(m)

        # [*, N_seq, N_res, C]
        mask = mask.unsqueeze(-1)
        a = self.linear_1(m) * mask
        b = self.linear_2(m) * mask

        # Put the residue dim first so _opm contracts over sequences.
        a = a.transpose(-2, -3)
        b = b.transpose(-2, -3)

        if chunk_size is not None:
            outer = self._chunk(a, b, chunk_size)
        else:
            outer = self._opm(a, b)

        # [*, N_res, N_res, 1] — pairwise count of valid sequence entries,
        # turning the masked sum into a masked mean.
        norm = torch.einsum("...abc,...adc->...bdc", mask, mask)

        # [*, N_res, N_res, C_z]
        outer = outer / (self.eps + norm)

        return outer
+ """ + + def __init__(self, c_z, n): + """ + Args: + c_z: + Pair transition channel dimension + n: + Factor by which c_z is multiplied to obtain hidden channel + dimension + """ + super(PairTransition, self).__init__() + + self.c_z = c_z + self.n = n + + self.layer_norm = LayerNorm(self.c_z) + self.linear_1 = Linear(self.c_z, self.n * self.c_z, init="relu") + self.relu = nn.ReLU() + self.linear_2 = Linear(self.n * self.c_z, c_z, init="final") + + def _transition(self, z, mask): + # [*, N_res, N_res, C_hidden] + z = self.linear_1(z) + z = self.relu(z) + + # [*, N_res, N_res, C_z] + z = self.linear_2(z) * mask + + return z + + @torch.jit.ignore + def _chunk(self, + z: torch.Tensor, + mask: torch.Tensor, + chunk_size: int, + ) -> torch.Tensor: + return chunk_layer( + self._transition, + {"z": z, "mask": mask}, + chunk_size=chunk_size, + no_batch_dims=len(z.shape[:-2]), + ) + + + def forward(self, + z: torch.Tensor, + mask: Optional[torch.Tensor] = None, + chunk_size: Optional[int] = None, + ) -> torch.Tensor: + """ + Args: + z: + [*, N_res, N_res, C_z] pair embedding + Returns: + [*, N_res, N_res, C_z] pair embedding update + """ + # DISCREPANCY: DeepMind forgets to apply the mask in this module. + if mask is None: + mask = z.new_ones(z.shape[:-1]) + + # [*, N_res, N_res, 1] + mask = mask.unsqueeze(-1) + + # [*, N_res, N_res, C_z] + z = self.layer_norm(z) + + if chunk_size is not None: + z = self._chunk(z, mask, chunk_size) + else: + z = self._transition(z=z, mask=mask) + + return z diff --git a/openfold/primitives.py b/openfold/primitives.py new file mode 100644 index 000000000000..bbc156f21d4a --- /dev/null +++ b/openfold/primitives.py @@ -0,0 +1,529 @@ +# Copyright 2021 AlQuraishi Laboratory +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
def _prod(nums):
    """Product of an iterable of numbers (1 for the empty iterable)."""
    result = 1
    for value in nums:
        result = result * value
    return result


def _calculate_fan(linear_weight_shape, fan="fan_in"):
    """Return fan-in, fan-out, or their average for a (out, in) shape."""
    fan_out, fan_in = linear_weight_shape

    if fan == "fan_in":
        return fan_in
    if fan == "fan_out":
        return fan_out
    if fan == "fan_avg":
        return (fan_in + fan_out) / 2
    raise ValueError("Invalid fan option")


def glorot_uniform_init_(weights):
    """Xavier/Glorot uniform initialization (gain 1)."""
    nn.init.xavier_uniform_(weights, gain=1)


def final_init_(weights):
    """Zero initialization, used for 'final' output projections."""
    with torch.no_grad():
        weights.fill_(0.0)


def gating_init_(weights):
    """Zero initialization for gating weights (bias is set separately)."""
    with torch.no_grad():
        weights.fill_(0.0)


def normal_init_(weights):
    """Kaiming-normal initialization with a linear nonlinearity."""
    torch.nn.init.kaiming_normal_(weights, nonlinearity="linear")


def ipa_point_weights_init_(weights):
    """Initialize so that softplus(weight) == 1."""
    with torch.no_grad():
        softplus_inverse_1 = 0.541324854612918
        weights.fill_(softplus_inverse_1)


class Linear(nn.Linear):
    """
    A Linear layer with built-in nonstandard initializations. Called just
    like torch.nn.Linear.

    Implements the initializers in 1.11.4, plus some additional ones found
    in the code.

    NOTE(review): "default" and "relu" both dispatch to normal_init_
    (kaiming normal), not the truncated-normal schemes the init docstring
    names — confirm whether that is intentional.
    """

    def __init__(
        self,
        in_dim: int,
        out_dim: int,
        bias: bool = True,
        init: str = "default",
        init_fn: Optional[Callable[[torch.Tensor, torch.Tensor], None]] = None,
    ):
        """
        Args:
            in_dim:
                The final dimension of inputs to the layer
            out_dim:
                The final dimension of layer outputs
            bias:
                Whether to learn an additive bias. True by default
            init:
                The initializer to use. Choose from:

                "default": LeCun fan-in truncated normal initialization
                "relu": He initialization w/ truncated normal distribution
                "glorot": Fan-average Glorot uniform initialization
                "gating": Weights=0, Bias=1
                "normal": Normal initialization with std=1/sqrt(fan_in)
                "final": Weights=0, Bias=0

                Overridden by init_fn if the latter is not None.
            init_fn:
                A custom initializer taking weight and bias as inputs.
                Overrides init if not None.
        """
        super(Linear, self).__init__(in_dim, out_dim, bias=bias)

        # Bias starts at zero regardless of the chosen weight initializer
        # ("gating" overwrites it with ones below).
        if bias:
            with torch.no_grad():
                self.bias.fill_(0)

        if init_fn is not None:
            # A custom initializer takes full control of weight and bias.
            init_fn(self.weight, self.bias)
            return

        if init in ("default", "relu", "normal"):
            normal_init_(self.weight)
        elif init == "glorot":
            glorot_uniform_init_(self.weight)
        elif init == "gating":
            gating_init_(self.weight)
            if bias:
                with torch.no_grad():
                    self.bias.fill_(1.0)
        elif init == "final":
            final_init_(self.weight)
        else:
            raise ValueError("Invalid init string.")


class LayerNorm(nn.Module):
    """Layer normalization over the final c_in channels with learned
    affine parameters (weight init 1, bias init 0)."""

    def __init__(self, c_in, eps=1e-5):
        super(LayerNorm, self).__init__()

        self.c_in = (c_in,)
        self.eps = eps

        self.weight = nn.Parameter(torch.ones(c_in))
        self.bias = nn.Parameter(torch.zeros(c_in))

    def forward(self, x):
        return nn.functional.layer_norm(
            x,
            self.c_in,
            self.weight,
            self.bias,
            self.eps,
        )
fp32 when the input is of + type bfloat16 + """ + s = torch.nn.functional.softmax(t, dim=dim) + + return s + + +#@torch.jit.script +def _attention(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, + biases: List[torch.Tensor]) -> torch.Tensor: + # [*, H, Q, C_hidden] + query = permute_final_dims(query, (1, 0, 2)) + + # [*, H, C_hidden, K] + key = permute_final_dims(key, (1, 2, 0)) + + # [*, H, V, C_hidden] + value = permute_final_dims(value, (1, 0, 2)) + + # [*, H, Q, K] + a = torch.matmul(query, key) + + for b in biases: + a += b + + a = softmax(a, -1) + + # [*, H, Q, C_hidden] + a = torch.matmul(a, value) + + # [*, Q, H, C_hidden] + a = a.transpose(-2, -3) + + return a + + +@torch.jit.ignore +def _attention_chunked_trainable( + query, + key, + value, + biases, + chunk_size, + chunk_dim, + checkpoint, +): + if (checkpoint and len(biases) > 2): + raise ValueError("Checkpointed version permits only permits two bias terms") + + def _checkpointable_attention(q, k, v, b1, b2): + bs = [b for b in [b1, b2] if b is not None] + return _attention(q, k, v, bs) + + o_chunks = [] + checkpoint_fn = get_checkpoint_fn() + count = query.shape[chunk_dim] + for start in range(0, count, chunk_size): + end = start + chunk_size + idx = [slice(None)] * len(query.shape) + idx[chunk_dim] = slice(start, end) + idx_tup = tuple(idx) + q_chunk = query[idx_tup] + k_chunk = key[idx_tup] + v_chunk = value[idx_tup] + + def _slice_bias(b): + idx[chunk_dim] = (slice(start, end) if b.shape[chunk_dim] != 1 else slice(None)) + return b[tuple(idx)] + + if (checkpoint): + bias_1_chunk, bias_2_chunk = [ + _slice_bias(b) if b is not None else None for b in (biases + [None, None])[:2] + ] + + o_chunk = checkpoint_fn(_checkpointable_attention, q_chunk, k_chunk, v_chunk, + bias_1_chunk, bias_2_chunk) + else: + bias_chunks = [_slice_bias(b) for b in biases] + + o_chunk = _attention(q_chunk, k_chunk, v_chunk, bias_chunks) + + o_chunks.append(o_chunk) + + o = torch.cat(o_chunks, dim=chunk_dim) + 
return o + + +class Attention(nn.Module): + """ + Standard multi-head attention using AlphaFold's default layer + initialization. Allows multiple bias vectors. + """ + + def __init__( + self, + c_q: int, + c_k: int, + c_v: int, + c_hidden: int, + no_heads: int, + gating: bool = True, + ): + """ + Args: + c_q: + Input dimension of query data + c_k: + Input dimension of key data + c_v: + Input dimension of value data + c_hidden: + Per-head hidden dimension + no_heads: + Number of attention heads + gating: + Whether the output should be gated using query data + """ + super(Attention, self).__init__() + + self.c_q = c_q + self.c_k = c_k + self.c_v = c_v + self.c_hidden = c_hidden + self.no_heads = no_heads + self.gating = gating + + # DISCREPANCY: c_hidden is not the per-head channel dimension, as + # stated in the supplement, but the overall channel dimension. + + self.linear_q = Linear(self.c_q, self.c_hidden * self.no_heads, bias=False, init="glorot") + self.linear_k = Linear(self.c_k, self.c_hidden * self.no_heads, bias=False, init="glorot") + self.linear_v = Linear(self.c_v, self.c_hidden * self.no_heads, bias=False, init="glorot") + self.linear_o = Linear(self.c_hidden * self.no_heads, self.c_q, init="final") + + self.linear_g = None + if self.gating: + self.linear_g = Linear(self.c_q, self.c_hidden * self.no_heads, init="gating") + + self.sigmoid = nn.Sigmoid() + + def _prep_qkv(self, q_x: torch.Tensor, + kv_x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + # [*, Q/K/V, H * C_hidden] + q = self.linear_q(q_x) + k = self.linear_k(kv_x) + v = self.linear_v(kv_x) + + # [*, Q/K, H, C_hidden] + q = q.view(q.shape[:-1] + (self.no_heads, -1)) + k = k.view(k.shape[:-1] + (self.no_heads, -1)) + v = v.view(v.shape[:-1] + (self.no_heads, -1)) + + q /= math.sqrt(self.c_hidden) + + return q, k, v + + def _wrap_up(self, o: torch.Tensor, q_x: torch.Tensor) -> torch.Tensor: + if (self.linear_g is not None): + g = self.sigmoid(self.linear_g(q_x)) + + # [*, Q, 
H, C_hidden] + g = g.view(g.shape[:-1] + (self.no_heads, -1)) + o = o * g + + # [*, Q, H * C_hidden] + o = flatten_final_dims(o, 2) + + # [*, Q, C_q] + o = self.linear_o(o) + + return o + + def forward( + self, + q_x: torch.Tensor, + kv_x: torch.Tensor, + biases: Optional[List[torch.Tensor]] = None, + use_lma: bool = False, + q_chunk_size: Optional[int] = None, + kv_chunk_size: Optional[int] = None, + ) -> torch.Tensor: + """ + Args: + q_x: + [*, Q, C_q] query data + kv_x: + [*, K, C_k] key data + biases: + List of biases that broadcast to [*, H, Q, K] + use_lma: + Whether to use low-memory attention + q_chunk_size: + Query chunk size (for LMA) + kv_chunk_size: + Key/Value chunk size (for LMA) + Returns + [*, Q, C_q] attention update + """ + if (biases is None): + biases = [] + if (use_lma and (q_chunk_size is None or kv_chunk_size is None)): + raise ValueError("If use_lma is specified, q_chunk_size and kv_chunk_size must " + "be provided") + + q, k, v = self._prep_qkv(q_x, kv_x) + + if (use_lma): + biases = [b.expand(b.shape[:-2] + (q_x.shape[-2],) + (kv_x.shape[-2],)) for b in biases] + + o = _lma(q, k, v, biases, q_chunk_size, kv_chunk_size) + else: + o = _attention(q, k, v, biases) + + o = self._wrap_up(o, q_x) + + return o + + +class GlobalAttention(nn.Module): + + def __init__(self, c_in, c_hidden, no_heads, inf, eps): + super(GlobalAttention, self).__init__() + + self.c_in = c_in + self.c_hidden = c_hidden + self.no_heads = no_heads + self.inf = inf + self.eps = eps + + self.linear_q = Linear(c_in, c_hidden * no_heads, bias=False, init="glorot") + + self.linear_k = Linear( + c_in, + c_hidden, + bias=False, + init="glorot", + ) + self.linear_v = Linear( + c_in, + c_hidden, + bias=False, + init="glorot", + ) + self.linear_g = Linear(c_in, c_hidden * no_heads, init="gating") + self.linear_o = Linear(c_hidden * no_heads, c_in, init="final") + + self.sigmoid = nn.Sigmoid() + + def forward(self, m: torch.Tensor, mask: torch.Tensor) -> torch.Tensor: + # [*, N_res, 
C_in] + q = torch.sum(m * mask.unsqueeze(-1), + dim=-2) / (torch.sum(mask, dim=-1)[..., None] + self.eps) + + # [*, N_res, H * C_hidden] + q = self.linear_q(q) + q *= (self.c_hidden**(-0.5)) + + # [*, N_res, H, C_hidden] + q = q.view(q.shape[:-1] + (self.no_heads, -1)) + + # [*, N_res, N_seq, C_hidden] + k = self.linear_k(m) + v = self.linear_v(m) + + # [*, N_res, H, N_seq] + a = torch.matmul( + q, + k.transpose(-1, -2), # [*, N_res, C_hidden, N_seq] + ) + bias = (self.inf * (mask - 1))[..., :, None, :] + a += bias + a = softmax(a) + + # [*, N_res, H, C_hidden] + o = torch.matmul( + a, + v, + ) + + # [*, N_res, N_seq, C_hidden] + g = self.sigmoid(self.linear_g(m)) + + # [*, N_res, N_seq, H, C_hidden] + g = g.view(g.shape[:-1] + (self.no_heads, -1)) + + # [*, N_res, N_seq, H, C_hidden] + o = o.unsqueeze(-3) * g + + # [*, N_res, N_seq, H * C_hidden] + o = o.reshape(o.shape[:-2] + (-1,)) + + # [*, N_res, N_seq, C_in] + m = self.linear_o(o) + + return m + + +def _lma( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + biases: List[torch.Tensor], + q_chunk_size: int, + kv_chunk_size: int, +): + no_q, no_kv = q.shape[-3], k.shape[-3] + + # [*, Q, H, C_hidden] + o = q.new_zeros(q.shape) + for q_s in range(0, no_q, q_chunk_size): + q_chunk = q[..., q_s:q_s + q_chunk_size, :, :] + large_bias_chunks = [b[..., q_s:q_s + q_chunk_size, :] for b in biases] + + maxes = [] + weights = [] + values = [] + for kv_s in range(0, no_kv, kv_chunk_size): + k_chunk = k[..., kv_s:kv_s + kv_chunk_size, :, :] + v_chunk = v[..., kv_s:kv_s + kv_chunk_size, :, :] + small_bias_chunks = [b[..., kv_s:kv_s + kv_chunk_size] for b in large_bias_chunks] + + a = torch.einsum( + "...qhd,...khd->...hqk", + q_chunk, + k_chunk, + ) + + for b in small_bias_chunks: + a += b + + a = a.transpose(-2, -3) + + max_a = torch.max(a, dim=-1, keepdim=True)[0] + exp_a = torch.exp(a - max_a) + exp_v = torch.einsum("...vhf,...qhv->...qhf", v_chunk, exp_a) + + maxes.append(max_a.detach().squeeze(-1)) + 
weights.append(torch.sum(exp_a, dim=-1)) + values.append(exp_v) + + chunk_max = torch.stack(maxes, dim=-3) + chunk_weights = torch.stack(weights, dim=-3) + chunk_values = torch.stack(values, dim=-4) + + global_max = torch.max(chunk_max, dim=-3, keepdim=True)[0] + max_diffs = torch.exp(chunk_max - global_max) + chunk_values *= max_diffs.unsqueeze(-1) + chunk_weights *= max_diffs + + all_values = torch.sum(chunk_values, dim=-4) + all_weights = torch.sum(chunk_weights.unsqueeze(-1), dim=-4) + + q_chunk_out = all_values / all_weights + + o[..., q_s:q_s + q_chunk_size, :, :] = q_chunk_out + + return o diff --git a/openfold/tensor_utils.py b/openfold/tensor_utils.py new file mode 100644 index 000000000000..7e5e8e4b6b5e --- /dev/null +++ b/openfold/tensor_utils.py @@ -0,0 +1,408 @@ +# Copyright 2021 AlQuraishi Laboratory +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from functools import partial +import torch +import torch.nn as nn +from typing import Tuple, List, Callable, Any, Dict, Sequence, Optional + + +def permute_final_dims(tensor: torch.Tensor, inds: List[int]): + zero_index = -1 * len(inds) + first_inds = list(range(len(tensor.shape[:zero_index]))) + return tensor.permute(first_inds + [zero_index + i for i in inds]) + + +def flatten_final_dims(t: torch.Tensor, no_dims: int): + return t.reshape(t.shape[:-no_dims] + (-1,)) + + +def masked_mean(mask, value, dim, eps=1e-4): + mask = mask.expand(*value.shape) + return torch.sum(mask * value, dim=dim) / (eps + torch.sum(mask, dim=dim)) + + +def pts_to_distogram(pts, min_bin=2.3125, max_bin=21.6875, no_bins=64): + boundaries = torch.linspace( + min_bin, max_bin, no_bins - 1, device=pts.device + ) + dists = torch.sqrt( + torch.sum((pts.unsqueeze(-2) - pts.unsqueeze(-3)) ** 2, dim=-1) + ) + return torch.bucketize(dists, boundaries) + + +def dict_multimap(fn, dicts): + first = dicts[0] + new_dict = {} + for k, v in first.items(): + all_v = [d[k] for d in dicts] + if type(v) is dict: + new_dict[k] = dict_multimap(fn, all_v) + else: + new_dict[k] = fn(all_v) + + return new_dict + + +def one_hot(x, v_bins): + reshaped_bins = v_bins.view(((1,) * len(x.shape)) + (len(v_bins),)) + diffs = x[..., None] - reshaped_bins + am = torch.argmin(torch.abs(diffs), dim=-1) + return nn.functional.one_hot(am, num_classes=len(v_bins)).float() + + +def batched_gather(data, inds, dim=0, no_batch_dims=0): + ranges = [] + for i, s in enumerate(data.shape[:no_batch_dims]): + r = torch.arange(s) + r = r.view(*(*((1,) * i), -1, *((1,) * (len(inds.shape) - i - 1)))) + ranges.append(r) + + remaining_dims = [ + slice(None) for _ in range(len(data.shape) - no_batch_dims) + ] + remaining_dims[dim - no_batch_dims if dim >= 0 else dim] = inds + ranges.extend(remaining_dims) + return data[ranges] + + +# With tree_map, a poor man's JAX tree_map +def dict_map(fn, dic, leaf_type): + new_dict = {} + for k, v in 
dic.items(): + if type(v) is dict: + new_dict[k] = dict_map(fn, v, leaf_type) + else: + new_dict[k] = tree_map(fn, v, leaf_type) + + return new_dict + + +def tree_map(fn, tree, leaf_type): + if isinstance(tree, dict): + return dict_map(fn, tree, leaf_type) + elif isinstance(tree, list): + return [tree_map(fn, x, leaf_type) for x in tree] + elif isinstance(tree, tuple): + return tuple([tree_map(fn, x, leaf_type) for x in tree]) + elif isinstance(tree, leaf_type): + return fn(tree) + else: + print(type(tree)) + raise ValueError("Not supported") + + +tensor_tree_map = partial(tree_map, leaf_type=torch.Tensor) + +def _fetch_dims(tree): + shapes = [] + tree_type = type(tree) + if tree_type is dict: + for v in tree.values(): + shapes.extend(_fetch_dims(v)) + elif tree_type is list or tree_type is tuple: + for t in tree: + shapes.extend(_fetch_dims(t)) + elif tree_type is torch.Tensor: + shapes.append(tree.shape) + else: + raise ValueError("Not supported") + + return shapes + + +@torch.jit.ignore +def _flat_idx_to_idx( + flat_idx: int, + dims: Tuple[int], +) -> Tuple[int]: + idx = [] + for d in reversed(dims): + idx.append(flat_idx % d) + flat_idx = flat_idx // d + + return tuple(reversed(idx)) + + +@torch.jit.ignore +def _get_minimal_slice_set( + start: Sequence[int], + end: Sequence[int], + dims: int, + start_edges: Optional[Sequence[bool]] = None, + end_edges: Optional[Sequence[bool]] = None, +) -> Sequence[Tuple[int]]: + """ + Produces an ordered sequence of tensor slices that, when used in + sequence on a tensor with shape dims, yields tensors that contain every + leaf in the contiguous range [start, end]. Care is taken to yield a + short sequence of slices, and perhaps even the shortest possible (I'm + pretty sure it's the latter). + + end is INCLUSIVE. 
+ """ + # start_edges and end_edges both indicate whether, starting from any given + # dimension, the start/end index is at the top/bottom edge of the + # corresponding tensor, modeled as a tree + def reduce_edge_list(l): + tally = 1 + for i in range(len(l)): + reversed_idx = -1 * (i + 1) + l[reversed_idx] *= tally + tally = l[reversed_idx] + + if(start_edges is None): + start_edges = [s == 0 for s in start] + reduce_edge_list(start_edges) + if(end_edges is None): + end_edges = [e == (d - 1) for e,d in zip(end, dims)] + reduce_edge_list(end_edges) + + # Base cases. Either start/end are empty and we're done, or the final, + # one-dimensional tensor can be simply sliced + if(len(start) == 0): + return [tuple()] + elif(len(start) == 1): + return [(slice(start[0], end[0] + 1),)] + + slices = [] + path = [] + + # Dimensions common to start and end can be selected directly + for s,e in zip(start, end): + if(s == e): + path.append(slice(s, s + 1)) + else: + break + + path = tuple(path) + divergence_idx = len(path) + + # start == end, and we're done + if(divergence_idx == len(dims)): + return [tuple(path)] + + def upper(): + sdi = start[divergence_idx] + return [ + path + (slice(sdi, sdi + 1),) + s for s in + _get_minimal_slice_set( + start[divergence_idx + 1:], + [d - 1 for d in dims[divergence_idx + 1:]], + dims[divergence_idx + 1:], + start_edges=start_edges[divergence_idx + 1:], + end_edges=[1 for _ in end_edges[divergence_idx + 1:]] + ) + ] + + def lower(): + edi = end[divergence_idx] + return [ + path + (slice(edi, edi + 1),) + s for s in + _get_minimal_slice_set( + [0 for _ in start[divergence_idx + 1:]], + end[divergence_idx + 1:], + dims[divergence_idx + 1:], + start_edges=[1 for _ in start_edges[divergence_idx + 1:]], + end_edges=end_edges[divergence_idx + 1:], + ) + ] + + # If both start and end are at the edges of the subtree rooted at + # divergence_idx, we can just select the whole subtree at once + if(start_edges[divergence_idx] and 
end_edges[divergence_idx]): + slices.append( + path + (slice(start[divergence_idx], end[divergence_idx] + 1),) + ) + # If just start is at the edge, we can grab almost all of the subtree, + # treating only the ragged bottom edge as an edge case + elif(start_edges[divergence_idx]): + slices.append( + path + (slice(start[divergence_idx], end[divergence_idx]),) + ) + slices.extend(lower()) + # Analogous to the previous case, but the top is ragged this time + elif(end_edges[divergence_idx]): + slices.extend(upper()) + slices.append( + path + (slice(start[divergence_idx] + 1, end[divergence_idx] + 1),) + ) + # If both sides of the range are ragged, we need to handle both sides + # separately. If there's contiguous meat in between them, we can index it + # in one big chunk + else: + slices.extend(upper()) + middle_ground = end[divergence_idx] - start[divergence_idx] + if(middle_ground > 1): + slices.append( + path + (slice(start[divergence_idx] + 1, end[divergence_idx]),) + ) + slices.extend(lower()) + + return [tuple(s) for s in slices] + + +@torch.jit.ignore +def _chunk_slice( + t: torch.Tensor, + flat_start: int, + flat_end: int, + no_batch_dims: int, +) -> torch.Tensor: + """ + Equivalent to + + t.reshape((-1,) + t.shape[no_batch_dims:])[flat_start:flat_end] + + but without the need for the initial reshape call, which can be + memory-intensive in certain situations. The only reshape operations + in this function are performed on sub-tensors that scale with + (flat_end - flat_start), the chunk size. 
+ """ + + batch_dims = t.shape[:no_batch_dims] + start_idx = list(_flat_idx_to_idx(flat_start, batch_dims)) + # _get_minimal_slice_set is inclusive + end_idx = list(_flat_idx_to_idx(flat_end - 1, batch_dims)) + + # Get an ordered list of slices to perform + slices = _get_minimal_slice_set( + start_idx, + end_idx, + batch_dims, + ) + + sliced_tensors = [t[s] for s in slices] + + return torch.cat( + [s.view((-1,) + t.shape[no_batch_dims:]) for s in sliced_tensors] + ) + + +def chunk_layer( + layer: Callable, + inputs: Dict[str, Any], + chunk_size: int, + no_batch_dims: int, + low_mem: bool = False, +) -> Any: + """ + Implements the "chunking" procedure described in section 1.11.8. + + Layer outputs and inputs are assumed to be simple "pytrees," + consisting only of (arbitrarily nested) lists, tuples, and dicts with + torch.Tensor leaves. + + Args: + layer: + The layer to be applied chunk-wise + inputs: + A (non-nested) dictionary of keyworded inputs. All leaves must + be tensors and must share the same batch dimensions. + chunk_size: + The number of sub-batches per chunk. If multiple batch + dimensions are specified, a "sub-batch" is defined as a single + indexing of all batch dimensions simultaneously (s.t. the + number of sub-batches is the product of the batch dimensions). + no_batch_dims: + How many of the initial dimensions of each input tensor can + be considered batch dimensions. + low_mem: + Avoids flattening potentially large input tensors. Unnecessary + in most cases, and is ever so slightly slower than the default + setting. + Returns: + The reassembled output of the layer on the inputs. + """ + if not (len(inputs) > 0): + raise ValueError("Must provide at least one input") + + initial_dims = [shape[:no_batch_dims] for shape in _fetch_dims(inputs)] + orig_batch_dims = tuple([max(s) for s in zip(*initial_dims)]) + + def _prep_inputs(t): + # TODO: make this more memory efficient. 
This sucks + if(not low_mem): + if not sum(t.shape[:no_batch_dims]) == no_batch_dims: + t = t.expand(orig_batch_dims + t.shape[no_batch_dims:]) + t = t.reshape(-1, *t.shape[no_batch_dims:]) + else: + t = t.expand(orig_batch_dims + t.shape[no_batch_dims:]) + return t + + prepped_inputs = tensor_tree_map(_prep_inputs, inputs) + + flat_batch_dim = 1 + for d in orig_batch_dims: + flat_batch_dim *= d + + no_chunks = flat_batch_dim // chunk_size + ( + flat_batch_dim % chunk_size != 0 + ) + + i = 0 + out = None + for _ in range(no_chunks): + # Chunk the input + if(not low_mem): + select_chunk = ( + lambda t: t[i : i + chunk_size] if t.shape[0] != 1 else t + ) + else: + select_chunk = ( + partial( + _chunk_slice, + flat_start=i, + flat_end=min(flat_batch_dim, i + chunk_size), + no_batch_dims=len(orig_batch_dims) + ) + ) + + chunks = tensor_tree_map(select_chunk, prepped_inputs) + + # Run the layer on the chunk + output_chunk = layer(**chunks) + + # Allocate space for the output + if out is None: + allocate = lambda t: t.new_zeros((flat_batch_dim,) + t.shape[1:]) + out = tensor_tree_map(allocate, output_chunk) + + # Put the chunk in its pre-allocated space + out_type = type(output_chunk) + if out_type is dict: + def assign(d1, d2): + for k, v in d1.items(): + if type(v) is dict: + assign(v, d2[k]) + else: + v[i : i + chunk_size] = d2[k] + + assign(out, output_chunk) + elif out_type is tuple: + for x1, x2 in zip(out, output_chunk): + x1[i : i + chunk_size] = x2 + elif out_type is torch.Tensor: + out[i : i + chunk_size] = output_chunk + else: + raise ValueError("Not supported") + + i += chunk_size + + reshape = lambda t: t.view(orig_batch_dims + t.shape[1:]) + out = tensor_tree_map(reshape, out) + + return out diff --git a/openfold/triangular_attention.py b/openfold/triangular_attention.py new file mode 100644 index 000000000000..6d3e37f4c681 --- /dev/null +++ b/openfold/triangular_attention.py @@ -0,0 +1,139 @@ +# Copyright 2021 AlQuraishi Laboratory +# Copyright 2021 
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from functools import partialmethod, partial
import math
from typing import Optional, List

import torch
import torch.nn as nn

from openfold.primitives import Linear, LayerNorm, Attention
from openfold.tensor_utils import (
    chunk_layer,
    permute_final_dims,
    flatten_final_dims,
)


class TriangleAttention(nn.Module):
    """Triangular self-attention over rows (starting) or columns (ending) of
    the pair representation."""

    def __init__(self, c_in, c_hidden, no_heads, starting, inf=1e9):
        """
        Args:
            c_in:
                Input channel dimension
            c_hidden:
                Overall hidden channel dimension (not per-head)
            no_heads:
                Number of attention heads
            starting:
                Whether to attend over rows (True) or columns (False)
            inf:
                Large constant used to mask out attention logits
        """
        super(TriangleAttention, self).__init__()

        self.c_in = c_in
        self.c_hidden = c_hidden
        self.no_heads = no_heads
        self.starting = starting
        self.inf = inf

        self.layer_norm = LayerNorm(self.c_in)

        # Per-head triangle bias projection.
        self.linear = Linear(c_in, self.no_heads, bias=False, init="normal")

        self.mha = Attention(self.c_in, self.c_in, self.c_in, self.c_hidden, self.no_heads)

    @torch.jit.ignore
    def _chunk(self,
               x: torch.Tensor,
               biases: List[torch.Tensor],
               chunk_size: int,
               ) -> torch.Tensor:
        """Run the MHA in sub-batches of ``chunk_size`` to bound memory."""
        mha_inputs = {
            "q_x": x,
            "kv_x": x,
            "biases": biases,
        }
        return chunk_layer(
            partial(self.mha),
            mha_inputs,
            chunk_size=chunk_size,
            no_batch_dims=len(x.shape[:-2]),
        )

    def forward(self,
                x: torch.Tensor,
                mask: Optional[torch.Tensor] = None,
                chunk_size: Optional[int] = None,
                ) -> torch.Tensor:
        """
        Args:
            x:
                [*, I, J, C_in] input tensor (e.g. the pair representation)
        Returns:
            [*, I, J, C_in] output tensor
        """
        if mask is None:
            # [*, I, J] -- all positions valid by default
            mask = x.new_ones(x.shape[:-1],)

        # Shape annotations assume self.starting. Else, I and J are flipped
        if not self.starting:
            x = x.transpose(-2, -3)
            mask = mask.transpose(-1, -2)

        # [*, I, J, C_in]
        x = self.layer_norm(x)

        # [*, I, 1, 1, J]
        mask_bias = (self.inf * (mask - 1))[..., :, None, None, :]

        # [*, H, I, J]
        triangle_bias = permute_final_dims(self.linear(x), (2, 0, 1))

        # [*, 1, H, I, J]
        triangle_bias = triangle_bias.unsqueeze(-4)

        biases = [mask_bias, triangle_bias]

        if chunk_size is not None:
            x = self._chunk(x, biases, chunk_size)
        else:
            x = self.mha(q_x=x, kv_x=x, biases=biases)

        # Undo the transpose applied for the "ending node" variant.
        if not self.starting:
            x = x.transpose(-2, -3)

        return x


class TriangleAttentionStartingNode(TriangleAttention):
    """
    Implements Algorithm 13.
    """

    __init__ = partialmethod(TriangleAttention.__init__, starting=True)


class TriangleAttentionEndingNode(TriangleAttention):
    """
    Implements Algorithm 14.
    """

    __init__ = partialmethod(TriangleAttention.__init__, starting=False)


# ---------------------------------------------------------------------------
# openfold/triangular_multiplicative_update.py
#
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from functools import partialmethod
from typing import Optional

import torch
import torch.nn as nn

from openfold.primitives import Linear, LayerNorm
from openfold.tensor_utils import permute_final_dims


class TriangleMultiplicativeUpdate(nn.Module):
    """
    Implements Algorithms 11 and 12.
    """

    def __init__(self, c_z, c_hidden, _outgoing=True):
        """
        Args:
            c_z:
                Input channel dimension
            c_hidden:
                Hidden channel dimension
        """
        super(TriangleMultiplicativeUpdate, self).__init__()
        self.c_z = c_z
        self.c_hidden = c_hidden
        self._outgoing = _outgoing

        # Gated "a"/"b" projections into the hidden dimension.
        self.linear_a_p = Linear(self.c_z, self.c_hidden)
        self.linear_a_g = Linear(self.c_z, self.c_hidden, init="gating")
        self.linear_b_p = Linear(self.c_z, self.c_hidden)
        self.linear_b_g = Linear(self.c_z, self.c_hidden, init="gating")
        # Output gate and projection back to c_z.
        self.linear_g = Linear(self.c_z, self.c_z, init="gating")
        self.linear_z = Linear(self.c_hidden, self.c_z, init="final")

        self.layer_norm_in = LayerNorm(self.c_z)
        self.layer_norm_out = LayerNorm(self.c_hidden)

        self.sigmoid = nn.Sigmoid()

    def _combine_projections(self,
                             a: torch.Tensor,
                             b: torch.Tensor,
                             ) -> torch.Tensor:
        # Subclasses define the direction of the triangle multiplication.
        raise NotImplementedError("This method needs to be overridden")

    def forward(self,
                z: torch.Tensor,
                mask: Optional[torch.Tensor] = None
                ) -> torch.Tensor:
        """
        Args:
            z:
                [*, N_res, N_res, C_z] input tensor
            mask:
                [*, N_res, N_res] input mask
        Returns:
            [*, N_res, N_res, C_z] output tensor
        """
        if mask is None:
            mask = z.new_ones(z.shape[:-1])

        # [*, N_res, N_res, 1] so the mask broadcasts over channels.
        mask = mask.unsqueeze(-1)

        z = self.layer_norm_in(z)

        # Gated, masked projections (Algorithm 11/12, lines 1-2).
        a = self.linear_a_p(z) * self.sigmoid(self.linear_a_g(z))
        a = a * mask
        b = self.linear_b_p(z) * self.sigmoid(self.linear_b_g(z))
        b = b * mask

        # Direction-specific triangle product, then normalize and project back.
        x = self._combine_projections(a, b)
        x = self.layer_norm_out(x)
        x = self.linear_z(x)

        # Output gating on the (normalized) input.
        g = self.sigmoid(self.linear_g(z))
        z = x * g

        return z


class TriangleMultiplicationOutgoing(TriangleMultiplicativeUpdate):
    """
    Implements Algorithm 11.
    """

    def _combine_projections(self,
                             a: torch.Tensor,    # [*, N_i, N_k, C]
                             b: torch.Tensor,    # [*, N_j, N_k, C]
                             ):
        # [*, C, N_i, N_j]
        p = torch.matmul(
            permute_final_dims(a, (2, 0, 1)),
            permute_final_dims(b, (2, 1, 0)),
        )

        # [*, N_i, N_j, C]
        return permute_final_dims(p, (1, 2, 0))


class TriangleMultiplicationIncoming(TriangleMultiplicativeUpdate):
    """
    Implements Algorithm 12.
    """

    def _combine_projections(self,
                             a: torch.Tensor,    # [*, N_k, N_i, C]
                             b: torch.Tensor,    # [*, N_k, N_j, C]
                             ):
        # [*, C, N_i, N_j]
        p = torch.matmul(
            permute_final_dims(a, (2, 1, 0)),
            permute_final_dims(b, (2, 0, 1)),
        )

        # [*, N_i, N_j, C]
        return permute_final_dims(p, (1, 2, 0))
torch.nn.Module, node, pair): + loop = 10 + with torch.no_grad(): + for _ in range(loop // 4): + model(node, pair) + torch.cuda.synchronize() + time1 = time.time() + for _ in range(loop): + model(node, pair) + torch.cuda.synchronize() + time2 = time.time() + return (time2 - time1) / loop + + +def benchmark_evoformer(): + # data + msa_len = 300 + pair_len = 800 + node = torch.randn(1, msa_len, pair_len, 256).cuda() + pair = torch.randn(1, pair_len, pair_len, 128).cuda() + + # build gm model + max_memory = 3000 # MB + model = evoformer_base().cuda() + # trace the module and replace codegen + graph = ColoTracer().trace( + model, + meta_args={ + "node": node.to(torch.device("meta")), + "pair": pair.to(torch.device("meta")), + }, + ) + gm_prop = torch.fx.symbolic_trace(model) # must use symbolic_trace + interp = MetaInfoProp(gm_prop) + interp.propagate( + MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, fake_device="cuda:0") + ) + # now run it twice to get meta info in graph module, not necessary + gm = torch.fx.GraphModule(model, graph) + interp = MetaInfoProp(gm) + interp.propagate( + MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, fake_device="cuda:0") + ) + # set code_gen + codegen = ChunkCodeGen(gm_prop, max_memory) + graph.set_codegen(codegen) + gm = ColoGraphModule(model, graph) + gm.recompile() + # print + code = graph.python_code("self").src + print(code) + + time_gm = _benchmark_evoformer(gm, node, pair) + print("gm %.4fs" % time_gm) + time_openfold = _benchmark_evoformer(model, node, pair) + print("openfold %.4fs" % time_openfold) + + +if __name__ == "__main__": + benchmark_evoformer() diff --git a/chunk_codegen.py b/chunk_codegen.py index 6caed88d84d2..033db50dbccb 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -1398,13 +1398,14 @@ def estimate_chunk_inference_mem( class ChunkSelector(object): def __init__( - self, index_tracer: IndexTracer, memory_estimator: MemoryEstimator, stratge + self, index_tracer: IndexTracer, 
memory_estimator: MemoryEstimator, stratge, max_memory=None ): self.index_tracer = index_tracer self.memory_estimator = memory_estimator assert stratge in ["min_memory", "fit_memory"] + assert (stratge == "fit_memory" and max_memory is not None) or stratge != "fit_memory" self.stratge = stratge - self.max_memory = 600 # MB + self.max_memory = max_memory # MB def _select_best_chunk_region( self, possible_chunk_regions, chunk_infos, peak_node, max_chunk_region, mem_peak @@ -1556,13 +1557,13 @@ def _is_legal_region(self, cur_chunk_info, chunk_infos): class ChunkRegionSearch(object): - def __init__(self, gm) -> None: + def __init__(self, gm, max_memory=None) -> None: self.gm = gm self.index_tracer = IndexTracer(list(gm.graph.nodes)) self.index_tracer.trace_index() self.memory_estimator = MemoryEstimator(self.index_tracer) self.chunk_selector = ChunkSelector( - self.index_tracer, self.memory_estimator, stratge="fit_memory" + self.index_tracer, self.memory_estimator, stratge="fit_memory", max_memory=max_memory ) def _find_peak_node(self, mem_peak): @@ -1897,6 +1898,7 @@ def emit_code_with_chunk( delete_unused_value_func, meta_nodes, meta_graph, + max_memory=None, ): """Emit code with nested activation checkpoint When we detect some of the node.activation_checkpoint is a List, we will use @@ -1912,7 +1914,7 @@ def emit_code_with_chunk( node_list = list(nodes) # find the chunk regions - chunk_region_search = ChunkRegionSearch(meta_graph) + chunk_region_search = ChunkRegionSearch(meta_graph, max_memory) chunk_search = chunk_region_search.search_region() chunk_regions = [i["region"] for i in chunk_search] @@ -1989,9 +1991,10 @@ def emit_code_with_chunk( if CODEGEN_AVAILABLE: class ChunkCodeGen(CodeGen): - def __init__(self, meta_graph): + def __init__(self, meta_graph, max_memory=None): super().__init__() self.meta_graph = meta_graph + self.max_memory = max_memory self.meta_node = list(meta_graph.graph.nodes) def _gen_python_code( @@ -2230,6 +2233,7 @@ def emit_node(node: 
Node, body): delete_unused_values, self.meta_node, self.meta_graph, + self.max_memory ) if len(body) == 0: From 5a916c0adb320b4a1cfc96e8a40364fb62a0a463 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Thu, 29 Dec 2022 14:42:29 +0800 Subject: [PATCH 067/209] add print --- autochunk_benchmark.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/autochunk_benchmark.py b/autochunk_benchmark.py index a34464212e02..0c55a3a8848c 100644 --- a/autochunk_benchmark.py +++ b/autochunk_benchmark.py @@ -1,24 +1,21 @@ -import copy +import time + import torch -import torch.nn.functional as F -import pytest import torch.fx -import torch.multiprocessing as mp -from torch.fx import GraphModule + +from chunk_codegen import ChunkCodeGen from colossalai.fx import ColoTracer -import colossalai -from colossalai.utils import free_port -from colossalai.core import global_context as gpc from colossalai.fx.graph_module import ColoGraphModule -from colossalai.fx.passes.meta_info_prop import MetaInfoProp, TensorMetadata +from colossalai.fx.passes.meta_info_prop import MetaInfoProp from colossalai.fx.profiler import MetaTensor from evoformer.evoformer import evoformer_base -from chunk_codegen import ChunkCodeGen -import time -def _benchmark_evoformer(model: torch.nn.Module, node, pair): - loop = 10 +def _benchmark_evoformer(model: torch.nn.Module, node, pair, title): + torch.cuda.reset_peak_memory_stats() + now_mem = torch.cuda.memory_allocated() / 1024**2 + + loop = 16 with torch.no_grad(): for _ in range(loop // 4): model(node, pair) @@ -28,7 +25,12 @@ def _benchmark_evoformer(model: torch.nn.Module, node, pair): model(node, pair) torch.cuda.synchronize() time2 = time.time() - return (time2 - time1) / loop + + new_max_mem = torch.cuda.max_memory_allocated() / 1024**2 + print( + "%s: time %.4fs, mem %dMB" + % (title, (time2 - time1) / loop, new_max_mem - now_mem) + ) def benchmark_evoformer(): @@ -69,10 +71,8 @@ def benchmark_evoformer(): code 
= graph.python_code("self").src print(code) - time_gm = _benchmark_evoformer(gm, node, pair) - print("gm %.4fs" % time_gm) - time_openfold = _benchmark_evoformer(model, node, pair) - print("openfold %.4fs" % time_openfold) + _benchmark_evoformer(gm, node, pair, "autochunk") + _benchmark_evoformer(model, node, pair, "openfold") if __name__ == "__main__": From 7a23deb58455b112cf187776857e2a262d0b737e Mon Sep 17 00:00:00 2001 From: oahzxl Date: Thu, 29 Dec 2022 14:47:16 +0800 Subject: [PATCH 068/209] code style --- autochunk_benchmark.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/autochunk_benchmark.py b/autochunk_benchmark.py index 0c55a3a8848c..f8e603f4ee63 100644 --- a/autochunk_benchmark.py +++ b/autochunk_benchmark.py @@ -34,15 +34,23 @@ def _benchmark_evoformer(model: torch.nn.Module, node, pair, title): def benchmark_evoformer(): - # data + # init data and model msa_len = 300 pair_len = 800 node = torch.randn(1, msa_len, pair_len, 256).cuda() pair = torch.randn(1, pair_len, pair_len, 128).cuda() + model = evoformer_base().cuda() - # build gm model + # build autochunk model max_memory = 3000 # MB - model = evoformer_base().cuda() + autochunk = _build_autochunk(model, max_memory, node, pair) + + # benchmark + _benchmark_evoformer(model, node, pair, "openfold") + _benchmark_evoformer(autochunk, node, pair, "autochunk") + + +def _build_autochunk(model, max_memory, node, pair): # trace the module and replace codegen graph = ColoTracer().trace( model, @@ -70,9 +78,7 @@ def benchmark_evoformer(): # print code = graph.python_code("self").src print(code) - - _benchmark_evoformer(gm, node, pair, "autochunk") - _benchmark_evoformer(model, node, pair, "openfold") + return gm if __name__ == "__main__": From efe6fe3a33c4b8c50c2e964188fef72d1f269cfd Mon Sep 17 00:00:00 2001 From: oahzxl Date: Thu, 29 Dec 2022 14:47:47 +0800 Subject: [PATCH 069/209] code style --- autochunk_benchmark.py | 34 +++++++++++++++++----------------- 1 file 
changed, 17 insertions(+), 17 deletions(-) diff --git a/autochunk_benchmark.py b/autochunk_benchmark.py index f8e603f4ee63..20f615b216f7 100644 --- a/autochunk_benchmark.py +++ b/autochunk_benchmark.py @@ -33,23 +33,6 @@ def _benchmark_evoformer(model: torch.nn.Module, node, pair, title): ) -def benchmark_evoformer(): - # init data and model - msa_len = 300 - pair_len = 800 - node = torch.randn(1, msa_len, pair_len, 256).cuda() - pair = torch.randn(1, pair_len, pair_len, 128).cuda() - model = evoformer_base().cuda() - - # build autochunk model - max_memory = 3000 # MB - autochunk = _build_autochunk(model, max_memory, node, pair) - - # benchmark - _benchmark_evoformer(model, node, pair, "openfold") - _benchmark_evoformer(autochunk, node, pair, "autochunk") - - def _build_autochunk(model, max_memory, node, pair): # trace the module and replace codegen graph = ColoTracer().trace( @@ -81,5 +64,22 @@ def _build_autochunk(model, max_memory, node, pair): return gm +def benchmark_evoformer(): + # init data and model + msa_len = 300 + pair_len = 800 + node = torch.randn(1, msa_len, pair_len, 256).cuda() + pair = torch.randn(1, pair_len, pair_len, 128).cuda() + model = evoformer_base().cuda() + + # build autochunk model + max_memory = 3000 # MB + autochunk = _build_autochunk(model, max_memory, node, pair) + + # benchmark + _benchmark_evoformer(model, node, pair, "openfold") + _benchmark_evoformer(autochunk, node, pair, "autochunk") + + if __name__ == "__main__": benchmark_evoformer() From 289f3a45c24233fec28af6d5651b3099b55ace8b Mon Sep 17 00:00:00 2001 From: oahzxl Date: Thu, 29 Dec 2022 15:01:15 +0800 Subject: [PATCH 070/209] init openfold --- evoformer_openfold/evoformer.py | 59 +++++++++ evoformer_openfold/initializer.py | 29 +++++ evoformer_openfold/kernel.py | 19 +++ evoformer_openfold/msa.py | 95 +++++++++++++++ evoformer_openfold/ops.py | 176 +++++++++++++++++++++++++++ evoformer_openfold/triangle.py | 192 ++++++++++++++++++++++++++++++ 6 files changed, 570 
insertions(+) create mode 100644 evoformer_openfold/evoformer.py create mode 100755 evoformer_openfold/initializer.py create mode 100644 evoformer_openfold/kernel.py create mode 100644 evoformer_openfold/msa.py create mode 100755 evoformer_openfold/ops.py create mode 100644 evoformer_openfold/triangle.py diff --git a/evoformer_openfold/evoformer.py b/evoformer_openfold/evoformer.py new file mode 100644 index 000000000000..cfd2bb2a2529 --- /dev/null +++ b/evoformer_openfold/evoformer.py @@ -0,0 +1,59 @@ +import torch +import torch.nn as nn + +from .msa import MSAStack +from .ops import OutProductMean +from .triangle import PairStack + + +def print_memory(init_mem, text=None): + now_mem = torch.cuda.memory_allocated() / 1024 ** 2 - init_mem + max_mem = torch.cuda.max_memory_allocated() / 1024 ** 2 - init_mem + print("%s now:%.2f max:%.2f" % ("" if text is None else text, now_mem, max_mem)) + torch.cuda.reset_peak_memory_stats() + + +class EvoformerBlock(nn.Module): + + def __init__(self, d_node, d_pair): + super(EvoformerBlock, self).__init__() + + self.msa_stack = MSAStack(d_node, d_pair, p_drop=0.15) + self.communication = OutProductMean(n_feat=d_node, n_feat_out=d_pair, n_feat_proj=32) + self.pair_stack = PairStack(d_pair=d_pair) + + def forward(self, node, pair): + node = self.msa_stack(node, pair) + pair = pair + self.communication(node) + pair = self.pair_stack(pair) + return node, pair + + +class Evoformer(nn.Module): + + def __init__(self, d_node, d_pair): + super(Evoformer, self).__init__() + + self.blocks = nn.ModuleList() + for _ in range(1): + self.blocks.append(EvoformerBlock(d_node, d_pair)) + + def forward(self, node, pair): + for b in self.blocks: + node, pair = b(node, pair) + return node, pair + + +def evoformer_tiny(): + return Evoformer(d_node=64, d_pair=32) + + +def evoformer_base(): + return Evoformer(d_node=256, d_pair=128) + + +def evoformer_large(): + return Evoformer(d_node=512, d_pair=256) + + +__all__ = ['Evoformer', 'evoformer_base', 
'evoformer_large'] diff --git a/evoformer_openfold/initializer.py b/evoformer_openfold/initializer.py new file mode 100755 index 000000000000..c6ce0659e597 --- /dev/null +++ b/evoformer_openfold/initializer.py @@ -0,0 +1,29 @@ +import math + +import numpy as np +import torch.nn as nn + + +def glorot_uniform_af(x, gain=1.0): + """ + initialize tensors the same as xavier_initializer in PyTorch, but the dimensions are different: + In PyTorch: + [feature_out, feature_in, n_head ...] + In Jax: + [... n_head, feature_in, feature_out] + However, there is a feature in original Alphafold2 code that they use the Jax version initializer to initialize tensors like: + [feature_in, n_head, feature_out] + + In this function, we keep this feature to initialize [feature_in, n_head, ..., feature_out] tensors + """ + fan_in, fan_out = x.shape[-2:] + if len(x.shape) > 2: + receptive_field_size = np.prod(x.shape[:-2]) + fan_in *= receptive_field_size + fan_out *= receptive_field_size + std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) + dev = math.sqrt(3.0) * std # Calculate uniform bounds from standard deviation + + nn.init.uniform_(x, -dev, dev) + + return x diff --git a/evoformer_openfold/kernel.py b/evoformer_openfold/kernel.py new file mode 100644 index 000000000000..26ab5dc53261 --- /dev/null +++ b/evoformer_openfold/kernel.py @@ -0,0 +1,19 @@ +import torch +import torch.nn.functional as F + + +def bias_sigmod_ele(y, bias, z): + return torch.sigmoid(y + bias) * z + + +def bias_dropout_add(x: torch.Tensor, bias: torch.Tensor, dropmask: torch.Tensor, + residual: torch.Tensor, prob: float) -> torch.Tensor: + out = (x + bias) * F.dropout(dropmask, p=prob, training=False) + out = residual + out + return out + + +def bias_ele_dropout_residual(ab: torch.Tensor, b: torch.Tensor, g: torch.Tensor, + dropout_mask: torch.Tensor, Z_raw: torch.Tensor, + prob: float) -> torch.Tensor: + return Z_raw + F.dropout(dropout_mask, p=prob, training=True) * (g * (ab + b)) \ No newline at end of file 
diff --git a/evoformer_openfold/msa.py b/evoformer_openfold/msa.py new file mode 100644 index 000000000000..cac456638a55 --- /dev/null +++ b/evoformer_openfold/msa.py @@ -0,0 +1,95 @@ +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange +from torch.nn import LayerNorm + +from .kernel import bias_dropout_add +from .ops import SelfAttention, Transition + + +class MSARowAttentionWithPairBias(nn.Module): + + def __init__(self, d_node, d_pair, c=32, n_head=8, p_drop=0.15): + super(MSARowAttentionWithPairBias, self).__init__() + self.d_node = d_node + self.d_pair = d_pair + self.c = c + self.n_head = n_head + self.p_drop = p_drop + + self.layernormM = LayerNorm(d_node) + self.layernormZ = LayerNorm(d_pair) + + _init_weights = torch.nn.init.normal_(torch.zeros([n_head, d_pair]), + std=1.0 / math.sqrt(d_pair)) + self.linear_b_weights = nn.parameter.Parameter(data=_init_weights, requires_grad=True) + + self.attention = SelfAttention(qkv_dim=d_node, + c=c, + n_head=n_head, + out_dim=d_node, + gating=True, + last_bias_fuse=True) + + self.out_bias = nn.parameter.Parameter(data=torch.zeros((d_node,)), requires_grad=True) + + def forward(self, M_raw, Z): + ## Input projections + M = self.layernormM(M_raw) + Z = self.layernormZ(Z) + b = F.linear(Z, self.linear_b_weights) + b = b.permute(0, 3, 1, 2) + # b = rearrange(b, 'b q k h -> b h q k') + + M = self.attention(M, b) + dropout_mask = torch.ones_like(M[:, 0:1, :, :]).to(M.device).to(M.dtype) + + return bias_dropout_add(M, self.out_bias, dropout_mask, M_raw, prob=self.p_drop) + + +class MSAColumnAttention(nn.Module): + + def __init__(self, d_node, c=32, n_head=8): + super(MSAColumnAttention, self).__init__() + self.d_node = d_node + self.c = c + self.n_head = n_head + + self.layernormM = LayerNorm(d_node) + self.attention = SelfAttention(qkv_dim=d_node, + c=c, + n_head=n_head, + out_dim=d_node, + gating=True) + + def forward(self, M_raw): + M = M_raw.transpose(-2, -3) 
+ M = self.layernormM(M) + + M = self.attention(M) + + M = M.transpose(-2, -3) + return M_raw + M + + +class MSAStack(nn.Module): + + def __init__(self, d_node, d_pair, p_drop=0.15): + super(MSAStack, self).__init__() + + self.MSARowAttentionWithPairBias = MSARowAttentionWithPairBias(d_node=d_node, + d_pair=d_pair, + p_drop=p_drop) + + self.MSAColumnAttention = MSAColumnAttention(d_node=d_node) + self.MSATransition = Transition(d=d_node) + + def forward(self, node, pair): + node = self.MSARowAttentionWithPairBias(node, pair) + node = self.MSAColumnAttention(node) + node = self.MSATransition(node) + + return node diff --git a/evoformer_openfold/ops.py b/evoformer_openfold/ops.py new file mode 100755 index 000000000000..611b7b0fe777 --- /dev/null +++ b/evoformer_openfold/ops.py @@ -0,0 +1,176 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange +from torch.nn import LayerNorm + +from .initializer import glorot_uniform_af +from .kernel import bias_sigmod_ele + + +class DropoutRowwise(nn.Module): + + def __init__(self, p): + super(DropoutRowwise, self).__init__() + self.p = p + self.dropout = nn.Dropout(p=p) + + def forward(self, x): + dropout_mask = torch.ones_like(x[:, 0:1, :, :]) + dropout_mask = self.dropout(dropout_mask) + return dropout_mask * x + + +class DropoutColumnwise(nn.Module): + + def __init__(self, p): + super(DropoutColumnwise, self).__init__() + self.p = p + self.dropout = nn.Dropout(p=p) + + def forward(self, x): + dropout_mask = torch.ones_like(x[:, :, 0:1, :]) + dropout_mask = self.dropout(dropout_mask) + return dropout_mask * x + + +class Transition(nn.Module): + + def __init__(self, d, n=4): + super(Transition, self).__init__() + self.norm = LayerNorm(d) + self.linear1 = Linear(d, n * d, initializer='relu') + self.linear2 = Linear(n * d, d, initializer='zeros') + + def forward(self, src): + x = self.norm(src) + x = self.linear2(F.relu(self.linear1(x))) + return src + x + + +class 
OutProductMean(nn.Module): + + def __init__(self, n_feat=64, n_feat_out=128, n_feat_proj=32): + super(OutProductMean, self).__init__() + + self.layernormM = LayerNorm(n_feat) + self.linear_a = Linear(n_feat, n_feat_proj) + self.linear_b = Linear(n_feat, n_feat_proj) + + self.o_linear = Linear(n_feat_proj * n_feat_proj, + n_feat_out, + initializer='zero', + use_bias=True) + + def forward(self, M): + M = self.layernormM(M) + left_act = self.linear_a(M) + right_act = self.linear_b(M) + + O = torch.einsum('bsid,bsje->bijde', left_act, right_act).contiguous() + # O = rearrange(O, 'b i j d e -> b i j (d e)') + O = O.reshape(O.shape[0], O.shape[1], O.shape[2], -1) + Z = self.o_linear(O) + + return Z + + +class Linear(nn.Linear): + """ + A Linear layer with built-in nonstandard initializations. Called just + like torch.nn.Linear. + Implements the initializers in 1.11.4, plus some additional ones found + in the code. + """ + + def __init__( + self, + feature_in: int, + feature_out: int, + initializer: str = 'linear', + use_bias: bool = True, + bias_init: float = 0., + ): + super(Linear, self).__init__(feature_in, feature_out, bias=use_bias) + + self.use_bias = use_bias + if initializer == 'linear': + glorot_uniform_af(self.weight, gain=1.0) + elif initializer == 'relu': + glorot_uniform_af(self.weight, gain=2.0) + elif initializer == 'zeros': + nn.init.zeros_(self.weight) + if self.use_bias: + with torch.no_grad(): + self.bias.fill_(bias_init) + + +class SelfAttention(nn.Module): + """ + Multi-Head SelfAttention dealing with [batch_size1, batch_size2, len, dim] tensors + """ + + def __init__(self, qkv_dim, c, n_head, out_dim, gating=True, last_bias_fuse=False): + super(SelfAttention, self).__init__() + self.qkv_dim = qkv_dim + self.c = c + self.n_head = n_head + self.out_dim = out_dim + self.gating = gating + self.last_bias_fuse = last_bias_fuse + + self.scaling = self.c**(-0.5) + + # self.to_qkv = Linear(qkv_dim, 3 * n_head * c, initializer='linear') + self.to_q = 
Linear(qkv_dim, n_head * c, initializer='linear', use_bias=False) + self.to_k = Linear(qkv_dim, n_head * c, initializer='linear', use_bias=False) + self.to_v = Linear(qkv_dim, n_head * c, initializer='linear', use_bias=False) + + if gating: + self.gating_bias = nn.parameter.Parameter(data=torch.ones((n_head * c,))) + self.gating_linear = Linear(qkv_dim, n_head * c, initializer='zero', use_bias=False) + + self.o_linear = Linear(n_head * c, + out_dim, + initializer='zero', + use_bias=(not last_bias_fuse)) + + def forward(self, in_data, nonbatched_bias=None): + """ + :param in_data: [batch_size1, batch_size2, len_qkv, qkv_dim] + :param bias: None or [batch_size1, batch_size2, n_head, len_q, len_kv] + :param nonbatched_bias: None or [batch_size1, n_head, len_q, len_kv] + """ + + # qkv = self.to_qkv(in_data).chunk(3, dim=-1) + # q, k, v = map(lambda t: rearrange(t, 'b1 b2 n (h d) -> b1 b2 h n d', h=self.n_head), qkv) + + q = self.to_q(in_data) + k = self.to_k(in_data) + v = self.to_v(in_data) + + # q, k, v = map(lambda t: rearrange(t, 'b1 b2 n (h d) -> b1 b2 h n d', h=self.n_head), + # [q, k, v]) + q, k, v = map(lambda t: t.view(t.shape[0], t.shape[1], t.shape[2], self.n_head, -1).permute(0, 1, 3, 2, 4), + [q, k, v]) + + q = q * self.scaling + + logits = torch.matmul(q, k.transpose(-1, -2)) + + if nonbatched_bias is not None: + logits += nonbatched_bias.unsqueeze(1) + weights = torch.softmax(logits, dim=-1) + # weights = softmax(logits) + + weighted_avg = torch.matmul(weights, v) + # weighted_avg = rearrange(weighted_avg, 'b1 b2 h n d -> b1 b2 n (h d)') + weighted_avg = weighted_avg.permute(0, 1, 3, 2, 4) + weighted_avg = weighted_avg.reshape(weighted_avg.shape[0], weighted_avg.shape[1], weighted_avg.shape[2], -1) + + if self.gating: + gate_values = self.gating_linear(in_data) + weighted_avg = bias_sigmod_ele(gate_values, self.gating_bias, weighted_avg) + + output = self.o_linear(weighted_avg) + return output diff --git a/evoformer_openfold/triangle.py 
b/evoformer_openfold/triangle.py new file mode 100644 index 000000000000..f479469c3836 --- /dev/null +++ b/evoformer_openfold/triangle.py @@ -0,0 +1,192 @@ +import math + +import torch +import torch.nn as nn +from torch.nn import LayerNorm + +from .kernel import bias_dropout_add, bias_ele_dropout_residual +from .ops import Linear, SelfAttention, Transition + + +def permute_final_dims(tensor, inds): + zero_index = -1 * len(inds) + first_inds = list(range(len(tensor.shape[:zero_index]))) + return tensor.permute(first_inds + [zero_index + i for i in inds]) + + +class TriangleMultiplicationOutgoing(nn.Module): + + def __init__(self, d_pair, p_drop, c=128): + super(TriangleMultiplicationOutgoing, self).__init__() + self.d_pair = d_pair + self.c = c + + self.layernorm1 = LayerNorm(d_pair) + self.left_projection = Linear(d_pair, c) + self.right_projection = Linear(d_pair, c) + self.left_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.) + self.right_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.) + + self.output_gate = Linear(d_pair, d_pair, initializer='zeros', bias_init=1.) 
+ self.layernorm2 = LayerNorm(c) + self.output_projection = Linear(d_pair, d_pair, initializer='zeros', use_bias=False) + self.output_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True) + + self.p_drop = p_drop + + def forward(self, Z_raw): + Z = self.layernorm1(Z_raw) + left_proj_act = self.left_projection(Z) + right_proj_act = self.right_projection(Z) + + left_proj_act = left_proj_act * torch.sigmoid(self.left_gate(Z)) + right_proj_act = right_proj_act * torch.sigmoid(self.right_gate(Z)) + + g = torch.sigmoid(self.output_gate(Z)) + # p = torch.matmul( + # permute_final_dims(left_proj_act, (2, 0, 1)), + # permute_final_dims(right_proj_act, (2, 1, 0)), + # ) + # ab = permute_final_dims(p, (1, 2, 0)) + + ab = torch.einsum('bikd,bjkd->bijd', left_proj_act, right_proj_act) + ab = self.output_projection(self.layernorm2(ab)) + dropout_mask = torch.ones_like(Z[:, 0:1, :, :]).to(Z.device).to(Z.dtype) + return bias_ele_dropout_residual(ab, + self.output_bias, + g, + dropout_mask, + Z_raw, + prob=self.p_drop) + + +class TriangleMultiplicationIncoming(nn.Module): + + def __init__(self, d_pair, p_drop, c=128): + super(TriangleMultiplicationIncoming, self).__init__() + self.d_pair = d_pair + self.c = c + + self.layernorm1 = LayerNorm(d_pair) + self.left_projection = Linear(d_pair, c) + self.right_projection = Linear(d_pair, c) + self.left_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.) + self.right_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.) + + self.output_gate = Linear(d_pair, d_pair, initializer='zeros', bias_init=1.) 
+ self.layernorm2 = LayerNorm(c) + self.output_projection = Linear(d_pair, d_pair, initializer='zeros', use_bias=False) + self.output_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True) + + self.p_drop = p_drop + + def forward(self, Z_raw): + Z = self.layernorm1(Z_raw) + left_proj_act = self.left_projection(Z) + right_proj_act = self.right_projection(Z) + + left_proj_act = left_proj_act * torch.sigmoid(self.left_gate(Z)) + right_proj_act = right_proj_act * torch.sigmoid(self.right_gate(Z)) + + g = torch.sigmoid(self.output_gate(Z)) + # p = torch.matmul( + # permute_final_dims(left_proj_act, (2, 1, 0)), + # permute_final_dims(right_proj_act, (2, 0, 1)), + # ) + # ab = permute_final_dims(p, (1, 2, 0)) + + ab = torch.einsum('bkid,bkjd->bijd', left_proj_act, right_proj_act) + ab = self.output_projection(self.layernorm2(ab)) + dropout_mask = torch.ones_like(Z[:, 0:1, :, :]).to(Z.device).to(Z.dtype) + return bias_ele_dropout_residual(ab, + self.output_bias, + g, + dropout_mask, + Z_raw, + prob=self.p_drop) + + +class TriangleAttentionStartingNode(nn.Module): + + def __init__(self, d_pair, p_drop, c=32, n_head=4): + super(TriangleAttentionStartingNode, self).__init__() + self.d_pair = d_pair + self.c = c + self.n_head = n_head + self.p_drop = p_drop + + self.layernorm1 = LayerNorm(d_pair) + _init_weights = torch.nn.init.normal_(torch.zeros([d_pair, n_head]), + std=1.0 / math.sqrt(d_pair)) + self.linear_b_weights = nn.parameter.Parameter(data=_init_weights) + self.attention = SelfAttention(qkv_dim=d_pair, + c=c, + n_head=n_head, + out_dim=d_pair, + gating=True, + last_bias_fuse=True) + + self.out_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True) + + def forward(self, Z_raw): + Z = self.layernorm1(Z_raw) + b = torch.einsum('bqkc,ch->bhqk', Z, self.linear_b_weights) + + Z = self.attention(Z, b) + + dropout_mask = torch.ones_like(Z[:, 0:1, :, :]).to(Z.device).to(Z.dtype) + return bias_dropout_add(Z, self.out_bias, 
dropout_mask, Z_raw, prob=self.p_drop) + + +class TriangleAttentionEndingNode(nn.Module): + + def __init__(self, d_pair, p_drop, c=32, n_head=4): + super(TriangleAttentionEndingNode, self).__init__() + self.d_pair = d_pair + self.c = c + self.n_head = n_head + self.p_drop = p_drop + + self.layernorm1 = LayerNorm(d_pair) + _init_weights = torch.nn.init.normal_(torch.zeros([d_pair, n_head]), + std=1.0 / math.sqrt(d_pair)) + self.linear_b_weights = nn.parameter.Parameter(data=_init_weights) + self.attention = SelfAttention(qkv_dim=d_pair, + c=c, + n_head=n_head, + out_dim=d_pair, + gating=True, + last_bias_fuse=True) + + self.out_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True) + + def forward(self, Z_raw): + Z = Z_raw.transpose(-2, -3) + Z = self.layernorm1(Z) + b = torch.einsum('bqkc,ch->bhqk', Z, self.linear_b_weights) + + Z = self.attention(Z, b) + + Z = Z.transpose(-2, -3) + dropout_mask = torch.ones_like(Z[:, :, 0:1, :]).to(Z.device).to(Z.dtype) + return bias_dropout_add(Z, self.out_bias, dropout_mask, Z_raw, prob=self.p_drop) + + +class PairStack(nn.Module): + + def __init__(self, d_pair, p_drop=0.25): + super(PairStack, self).__init__() + + self.TriangleMultiplicationOutgoing = TriangleMultiplicationOutgoing(d_pair, p_drop=p_drop) + self.TriangleMultiplicationIncoming = TriangleMultiplicationIncoming(d_pair, p_drop=p_drop) + self.TriangleAttentionStartingNode = TriangleAttentionStartingNode(d_pair, p_drop=p_drop) + self.TriangleAttentionEndingNode = TriangleAttentionEndingNode(d_pair, p_drop=p_drop) + self.PairTransition = Transition(d=d_pair) + + def forward(self, pair): + pair = self.TriangleMultiplicationOutgoing(pair) + pair = self.TriangleMultiplicationIncoming(pair) + pair = self.TriangleAttentionStartingNode(pair) + pair = self.TriangleAttentionEndingNode(pair) + pair = self.PairTransition(pair) + return pair From 5c4df01af3076069867a66c5fc7a8086e6c55c0a Mon Sep 17 00:00:00 2001 From: oahzxl Date: Thu, 29 Dec 2022 15:54:08 
+0800 Subject: [PATCH 071/209] update openfold --- openfold/evoformer.py | 29 ++++++------------- openfold/msa.py | 67 ++----------------------------------------- 2 files changed, 12 insertions(+), 84 deletions(-) diff --git a/openfold/evoformer.py b/openfold/evoformer.py index 21e422b04764..7fbcd8a76b4d 100644 --- a/openfold/evoformer.py +++ b/openfold/evoformer.py @@ -182,33 +182,28 @@ def forward( self, m: torch.Tensor, z: torch.Tensor, - msa_mask: torch.Tensor, - pair_mask: torch.Tensor, chunk_size: Optional[int] = None, - _mask_trans: bool = True, ) -> Tuple[torch.Tensor, torch.Tensor]: # DeepMind doesn't mask these transitions in the source, so _mask_trans # should be disabled to better approximate the exact activations of # the original. - msa_trans_mask = msa_mask if _mask_trans else None - pair_trans_mask = pair_mask if _mask_trans else None m = m + self.msa_transition( - m, mask=msa_trans_mask, chunk_size=chunk_size + m, chunk_size=chunk_size ) z = z + self.outer_product_mean( - m, mask=msa_mask, chunk_size=chunk_size + m, chunk_size=chunk_size ) - z = z + self.ps_dropout_row_layer(self.tri_mul_out(z, mask=pair_mask)) - z = z + self.ps_dropout_row_layer(self.tri_mul_in(z, mask=pair_mask)) + z = z + self.ps_dropout_row_layer(self.tri_mul_out(z)) + z = z + self.ps_dropout_row_layer(self.tri_mul_in(z)) z = z + self.ps_dropout_row_layer( - self.tri_att_start(z, mask=pair_mask, chunk_size=chunk_size) + self.tri_att_start(z, chunk_size=chunk_size) ) z = z + self.ps_dropout_col_layer( - self.tri_att_end(z, mask=pair_mask, chunk_size=chunk_size) + self.tri_att_end(z, chunk_size=chunk_size) ) z = z + self.pair_transition( - z, mask=pair_trans_mask, chunk_size=chunk_size + z, chunk_size=chunk_size ) return m, z @@ -274,22 +269,16 @@ def __init__(self, def forward(self, m: torch.Tensor, z: torch.Tensor, - msa_mask: torch.Tensor, - pair_mask: torch.Tensor, chunk_size: Optional[int] = None, - _mask_trans: bool = True, ) -> Tuple[torch.Tensor, torch.Tensor]: m = m + 
self.msa_dropout_layer( - self.msa_att_row(m, z=z, mask=msa_mask, chunk_size=chunk_size) + self.msa_att_row(m, z=z, chunk_size=chunk_size) ) - m = m + self.msa_att_col(m, mask=msa_mask, chunk_size=chunk_size) + m = m + self.msa_att_col(m, chunk_size=chunk_size) m, z = self.core( m, z, - msa_mask=msa_mask, - pair_mask=pair_mask, chunk_size=chunk_size, - _mask_trans=_mask_trans, ) return m, z diff --git a/openfold/msa.py b/openfold/msa.py index 172b26def5f1..00b822e7f390 100644 --- a/openfold/msa.py +++ b/openfold/msa.py @@ -136,45 +136,6 @@ def _prep_inputs(self, return m, mask_bias, z - @torch.jit.ignore - def _chunked_msa_attn(self, - m: torch.Tensor, - z: Optional[torch.Tensor], - mask: Optional[torch.Tensor], - chunk_logits: int, - checkpoint: bool, - ) -> torch.Tensor: - MSA_DIM = -4 - - def _get_qkv(m, z): - m, mask_bias, z = self._prep_inputs(m, z, mask) - q, k, v = self.mha._prep_qkv(m, m) - return m, q, k, v, mask_bias, z - - checkpoint_fn = get_checkpoint_fn() - - if(torch.is_grad_enabled() and checkpoint): - m, q, k, v, mask_bias, z = checkpoint_fn(_get_qkv, m, z) - else: - m, q, k, v, mask_bias, z = _get_qkv(m, z) - - o = _attention_chunked_trainable( - query=q, - key=k, - value=v, - biases=[mask_bias, z], - chunk_size=chunk_logits, - chunk_dim=MSA_DIM, - checkpoint=checkpoint, - ) - - if(torch.is_grad_enabled() and checkpoint): - # Storing an additional m here is far from ideal - m = checkpoint_fn(self.mha._wrap_up, o, m) - else: - m = self.mha._wrap_up(o, m) - - return m def forward(self, m: torch.Tensor, @@ -199,12 +160,6 @@ def forward(self, cost of slower execution. Chunking is not performed by default. 
""" - if(_chunk_logits is not None): - return self._chunked_msa_attn( - m=m, z=z, mask=mask, - chunk_logits=_chunk_logits, checkpoint=_checkpoint_chunks - ) - m, mask_bias, z = self._prep_inputs(m, z, mask) biases = [mask_bias] @@ -306,15 +261,11 @@ def forward(self, """ # [*, N_res, N_seq, C_in] m = m.transpose(-2, -3) - if mask is not None: - mask = mask.transpose(-1, -2) - m = self._msa_att(m, mask=mask, chunk_size=chunk_size) + m = self._msa_att(m, chunk_size=chunk_size) # [*, N_seq, N_res, C_in] m = m.transpose(-2, -3) - if mask is not None: - mask = mask.transpose(-1, -2) return m @@ -344,12 +295,10 @@ def __init__( @torch.jit.ignore def _chunk(self, m: torch.Tensor, - mask: torch.Tensor, chunk_size: int, ) -> torch.Tensor: mha_input = { "m": m, - "mask": mask, } return chunk_layer( self.global_attention, @@ -361,30 +310,20 @@ def _chunk(self, def forward( self, m: torch.Tensor, - mask: Optional[torch.Tensor] = None, chunk_size: Optional[int] = None, ) -> torch.Tensor: n_seq, n_res, c_in = m.shape[-3:] - if mask is None: - # [*, N_seq, N_res] - mask = torch.ones( - m.shape[:-1], - dtype=m.dtype, - device=m.device, - ).detach() - # [*, N_res, N_seq, C_in] m = m.transpose(-2, -3) - mask = mask.transpose(-1, -2) # [*, N_res, N_seq, C_in] m = self.layer_norm_m(m) if chunk_size is not None: - m = self._chunk(m, mask, chunk_size) + m = self._chunk(m, chunk_size) else: - m = self.global_attention(m=m, mask=mask) + m = self.global_attention(m=m) # [*, N_seq, N_res, C_in] m = m.transpose(-2, -3) From f7d8092c84eef1a5dfd976f883a6d38d5b11bd68 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Thu, 29 Dec 2022 16:01:05 +0800 Subject: [PATCH 072/209] align openfold --- autochunk_benchmark.py | 41 ++++++- evoformer_openfold/evoformer.py | 59 --------- evoformer_openfold/initializer.py | 29 ----- evoformer_openfold/kernel.py | 19 --- evoformer_openfold/msa.py | 95 --------------- evoformer_openfold/ops.py | 176 --------------------------- evoformer_openfold/triangle.py | 192 
----------------------------- openfold/evoformer.py | 194 ------------------------------ 8 files changed, 36 insertions(+), 769 deletions(-) delete mode 100644 evoformer_openfold/evoformer.py delete mode 100755 evoformer_openfold/initializer.py delete mode 100644 evoformer_openfold/kernel.py delete mode 100644 evoformer_openfold/msa.py delete mode 100755 evoformer_openfold/ops.py delete mode 100644 evoformer_openfold/triangle.py diff --git a/autochunk_benchmark.py b/autochunk_benchmark.py index 20f615b216f7..679016438c59 100644 --- a/autochunk_benchmark.py +++ b/autochunk_benchmark.py @@ -9,20 +9,27 @@ from colossalai.fx.passes.meta_info_prop import MetaInfoProp from colossalai.fx.profiler import MetaTensor from evoformer.evoformer import evoformer_base +from openfold.evoformer import EvoformerBlock -def _benchmark_evoformer(model: torch.nn.Module, node, pair, title): +def _benchmark_evoformer(model: torch.nn.Module, node, pair, title, chunk_size=None): torch.cuda.reset_peak_memory_stats() now_mem = torch.cuda.memory_allocated() / 1024**2 loop = 16 with torch.no_grad(): for _ in range(loop // 4): - model(node, pair) + if chunk_size: + model(node, pair, chunk_size) + else: + model(node, pair) torch.cuda.synchronize() time1 = time.time() for _ in range(loop): - model(node, pair) + if chunk_size: + model(node, pair, chunk_size) + else: + model(node, pair) torch.cuda.synchronize() time2 = time.time() @@ -64,6 +71,26 @@ def _build_autochunk(model, max_memory, node, pair): return gm +def _build_openfold(): + model = EvoformerBlock( + c_m=256, + c_z=128, + c_hidden_msa_att=32, + c_hidden_opm=32, + c_hidden_mul=128, + c_hidden_pair_att=32, + no_heads_msa=8, + no_heads_pair=4, + transition_n=4, + msa_dropout=0.15, + pair_dropout=0.15, + inf=1e4, + eps=1e-4, + is_multimer=False, + ).cuda() + return model + + def benchmark_evoformer(): # init data and model msa_len = 300 @@ -74,10 +101,14 @@ def benchmark_evoformer(): # build autochunk model max_memory = 3000 # MB - autochunk 
= _build_autochunk(model, max_memory, node, pair) + autochunk = _build_autochunk(evoformer_base().cuda(), max_memory, node, pair) + + # build openfold + openfold = _build_openfold() # benchmark - _benchmark_evoformer(model, node, pair, "openfold") + _benchmark_evoformer(model, node, pair, "base") + _benchmark_evoformer(openfold, node, pair, "openfold", chunk_size=4) _benchmark_evoformer(autochunk, node, pair, "autochunk") diff --git a/evoformer_openfold/evoformer.py b/evoformer_openfold/evoformer.py deleted file mode 100644 index cfd2bb2a2529..000000000000 --- a/evoformer_openfold/evoformer.py +++ /dev/null @@ -1,59 +0,0 @@ -import torch -import torch.nn as nn - -from .msa import MSAStack -from .ops import OutProductMean -from .triangle import PairStack - - -def print_memory(init_mem, text=None): - now_mem = torch.cuda.memory_allocated() / 1024 ** 2 - init_mem - max_mem = torch.cuda.max_memory_allocated() / 1024 ** 2 - init_mem - print("%s now:%.2f max:%.2f" % ("" if text is None else text, now_mem, max_mem)) - torch.cuda.reset_peak_memory_stats() - - -class EvoformerBlock(nn.Module): - - def __init__(self, d_node, d_pair): - super(EvoformerBlock, self).__init__() - - self.msa_stack = MSAStack(d_node, d_pair, p_drop=0.15) - self.communication = OutProductMean(n_feat=d_node, n_feat_out=d_pair, n_feat_proj=32) - self.pair_stack = PairStack(d_pair=d_pair) - - def forward(self, node, pair): - node = self.msa_stack(node, pair) - pair = pair + self.communication(node) - pair = self.pair_stack(pair) - return node, pair - - -class Evoformer(nn.Module): - - def __init__(self, d_node, d_pair): - super(Evoformer, self).__init__() - - self.blocks = nn.ModuleList() - for _ in range(1): - self.blocks.append(EvoformerBlock(d_node, d_pair)) - - def forward(self, node, pair): - for b in self.blocks: - node, pair = b(node, pair) - return node, pair - - -def evoformer_tiny(): - return Evoformer(d_node=64, d_pair=32) - - -def evoformer_base(): - return Evoformer(d_node=256, 
d_pair=128) - - -def evoformer_large(): - return Evoformer(d_node=512, d_pair=256) - - -__all__ = ['Evoformer', 'evoformer_base', 'evoformer_large'] diff --git a/evoformer_openfold/initializer.py b/evoformer_openfold/initializer.py deleted file mode 100755 index c6ce0659e597..000000000000 --- a/evoformer_openfold/initializer.py +++ /dev/null @@ -1,29 +0,0 @@ -import math - -import numpy as np -import torch.nn as nn - - -def glorot_uniform_af(x, gain=1.0): - """ - initialize tensors the same as xavier_initializer in PyTorch, but the dimensions are different: - In PyTorch: - [feature_out, feature_in, n_head ...] - In Jax: - [... n_head, feature_in, feature_out] - However, there is a feature in original Alphafold2 code that they use the Jax version initializer to initialize tensors like: - [feature_in, n_head, feature_out] - - In this function, we keep this feature to initialize [feature_in, n_head, ..., feature_out] tensors - """ - fan_in, fan_out = x.shape[-2:] - if len(x.shape) > 2: - receptive_field_size = np.prod(x.shape[:-2]) - fan_in *= receptive_field_size - fan_out *= receptive_field_size - std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) - dev = math.sqrt(3.0) * std # Calculate uniform bounds from standard deviation - - nn.init.uniform_(x, -dev, dev) - - return x diff --git a/evoformer_openfold/kernel.py b/evoformer_openfold/kernel.py deleted file mode 100644 index 26ab5dc53261..000000000000 --- a/evoformer_openfold/kernel.py +++ /dev/null @@ -1,19 +0,0 @@ -import torch -import torch.nn.functional as F - - -def bias_sigmod_ele(y, bias, z): - return torch.sigmoid(y + bias) * z - - -def bias_dropout_add(x: torch.Tensor, bias: torch.Tensor, dropmask: torch.Tensor, - residual: torch.Tensor, prob: float) -> torch.Tensor: - out = (x + bias) * F.dropout(dropmask, p=prob, training=False) - out = residual + out - return out - - -def bias_ele_dropout_residual(ab: torch.Tensor, b: torch.Tensor, g: torch.Tensor, - dropout_mask: torch.Tensor, Z_raw: torch.Tensor, - 
prob: float) -> torch.Tensor: - return Z_raw + F.dropout(dropout_mask, p=prob, training=True) * (g * (ab + b)) \ No newline at end of file diff --git a/evoformer_openfold/msa.py b/evoformer_openfold/msa.py deleted file mode 100644 index cac456638a55..000000000000 --- a/evoformer_openfold/msa.py +++ /dev/null @@ -1,95 +0,0 @@ -import math - -import torch -import torch.nn as nn -import torch.nn.functional as F -from einops import rearrange -from torch.nn import LayerNorm - -from .kernel import bias_dropout_add -from .ops import SelfAttention, Transition - - -class MSARowAttentionWithPairBias(nn.Module): - - def __init__(self, d_node, d_pair, c=32, n_head=8, p_drop=0.15): - super(MSARowAttentionWithPairBias, self).__init__() - self.d_node = d_node - self.d_pair = d_pair - self.c = c - self.n_head = n_head - self.p_drop = p_drop - - self.layernormM = LayerNorm(d_node) - self.layernormZ = LayerNorm(d_pair) - - _init_weights = torch.nn.init.normal_(torch.zeros([n_head, d_pair]), - std=1.0 / math.sqrt(d_pair)) - self.linear_b_weights = nn.parameter.Parameter(data=_init_weights, requires_grad=True) - - self.attention = SelfAttention(qkv_dim=d_node, - c=c, - n_head=n_head, - out_dim=d_node, - gating=True, - last_bias_fuse=True) - - self.out_bias = nn.parameter.Parameter(data=torch.zeros((d_node,)), requires_grad=True) - - def forward(self, M_raw, Z): - ## Input projections - M = self.layernormM(M_raw) - Z = self.layernormZ(Z) - b = F.linear(Z, self.linear_b_weights) - b = b.permute(0, 3, 1, 2) - # b = rearrange(b, 'b q k h -> b h q k') - - M = self.attention(M, b) - dropout_mask = torch.ones_like(M[:, 0:1, :, :]).to(M.device).to(M.dtype) - - return bias_dropout_add(M, self.out_bias, dropout_mask, M_raw, prob=self.p_drop) - - -class MSAColumnAttention(nn.Module): - - def __init__(self, d_node, c=32, n_head=8): - super(MSAColumnAttention, self).__init__() - self.d_node = d_node - self.c = c - self.n_head = n_head - - self.layernormM = LayerNorm(d_node) - self.attention = 
SelfAttention(qkv_dim=d_node, - c=c, - n_head=n_head, - out_dim=d_node, - gating=True) - - def forward(self, M_raw): - M = M_raw.transpose(-2, -3) - M = self.layernormM(M) - - M = self.attention(M) - - M = M.transpose(-2, -3) - return M_raw + M - - -class MSAStack(nn.Module): - - def __init__(self, d_node, d_pair, p_drop=0.15): - super(MSAStack, self).__init__() - - self.MSARowAttentionWithPairBias = MSARowAttentionWithPairBias(d_node=d_node, - d_pair=d_pair, - p_drop=p_drop) - - self.MSAColumnAttention = MSAColumnAttention(d_node=d_node) - self.MSATransition = Transition(d=d_node) - - def forward(self, node, pair): - node = self.MSARowAttentionWithPairBias(node, pair) - node = self.MSAColumnAttention(node) - node = self.MSATransition(node) - - return node diff --git a/evoformer_openfold/ops.py b/evoformer_openfold/ops.py deleted file mode 100755 index 611b7b0fe777..000000000000 --- a/evoformer_openfold/ops.py +++ /dev/null @@ -1,176 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F -from einops import rearrange -from torch.nn import LayerNorm - -from .initializer import glorot_uniform_af -from .kernel import bias_sigmod_ele - - -class DropoutRowwise(nn.Module): - - def __init__(self, p): - super(DropoutRowwise, self).__init__() - self.p = p - self.dropout = nn.Dropout(p=p) - - def forward(self, x): - dropout_mask = torch.ones_like(x[:, 0:1, :, :]) - dropout_mask = self.dropout(dropout_mask) - return dropout_mask * x - - -class DropoutColumnwise(nn.Module): - - def __init__(self, p): - super(DropoutColumnwise, self).__init__() - self.p = p - self.dropout = nn.Dropout(p=p) - - def forward(self, x): - dropout_mask = torch.ones_like(x[:, :, 0:1, :]) - dropout_mask = self.dropout(dropout_mask) - return dropout_mask * x - - -class Transition(nn.Module): - - def __init__(self, d, n=4): - super(Transition, self).__init__() - self.norm = LayerNorm(d) - self.linear1 = Linear(d, n * d, initializer='relu') - self.linear2 = Linear(n * d, d, 
initializer='zeros') - - def forward(self, src): - x = self.norm(src) - x = self.linear2(F.relu(self.linear1(x))) - return src + x - - -class OutProductMean(nn.Module): - - def __init__(self, n_feat=64, n_feat_out=128, n_feat_proj=32): - super(OutProductMean, self).__init__() - - self.layernormM = LayerNorm(n_feat) - self.linear_a = Linear(n_feat, n_feat_proj) - self.linear_b = Linear(n_feat, n_feat_proj) - - self.o_linear = Linear(n_feat_proj * n_feat_proj, - n_feat_out, - initializer='zero', - use_bias=True) - - def forward(self, M): - M = self.layernormM(M) - left_act = self.linear_a(M) - right_act = self.linear_b(M) - - O = torch.einsum('bsid,bsje->bijde', left_act, right_act).contiguous() - # O = rearrange(O, 'b i j d e -> b i j (d e)') - O = O.reshape(O.shape[0], O.shape[1], O.shape[2], -1) - Z = self.o_linear(O) - - return Z - - -class Linear(nn.Linear): - """ - A Linear layer with built-in nonstandard initializations. Called just - like torch.nn.Linear. - Implements the initializers in 1.11.4, plus some additional ones found - in the code. 
- """ - - def __init__( - self, - feature_in: int, - feature_out: int, - initializer: str = 'linear', - use_bias: bool = True, - bias_init: float = 0., - ): - super(Linear, self).__init__(feature_in, feature_out, bias=use_bias) - - self.use_bias = use_bias - if initializer == 'linear': - glorot_uniform_af(self.weight, gain=1.0) - elif initializer == 'relu': - glorot_uniform_af(self.weight, gain=2.0) - elif initializer == 'zeros': - nn.init.zeros_(self.weight) - if self.use_bias: - with torch.no_grad(): - self.bias.fill_(bias_init) - - -class SelfAttention(nn.Module): - """ - Multi-Head SelfAttention dealing with [batch_size1, batch_size2, len, dim] tensors - """ - - def __init__(self, qkv_dim, c, n_head, out_dim, gating=True, last_bias_fuse=False): - super(SelfAttention, self).__init__() - self.qkv_dim = qkv_dim - self.c = c - self.n_head = n_head - self.out_dim = out_dim - self.gating = gating - self.last_bias_fuse = last_bias_fuse - - self.scaling = self.c**(-0.5) - - # self.to_qkv = Linear(qkv_dim, 3 * n_head * c, initializer='linear') - self.to_q = Linear(qkv_dim, n_head * c, initializer='linear', use_bias=False) - self.to_k = Linear(qkv_dim, n_head * c, initializer='linear', use_bias=False) - self.to_v = Linear(qkv_dim, n_head * c, initializer='linear', use_bias=False) - - if gating: - self.gating_bias = nn.parameter.Parameter(data=torch.ones((n_head * c,))) - self.gating_linear = Linear(qkv_dim, n_head * c, initializer='zero', use_bias=False) - - self.o_linear = Linear(n_head * c, - out_dim, - initializer='zero', - use_bias=(not last_bias_fuse)) - - def forward(self, in_data, nonbatched_bias=None): - """ - :param in_data: [batch_size1, batch_size2, len_qkv, qkv_dim] - :param bias: None or [batch_size1, batch_size2, n_head, len_q, len_kv] - :param nonbatched_bias: None or [batch_size1, n_head, len_q, len_kv] - """ - - # qkv = self.to_qkv(in_data).chunk(3, dim=-1) - # q, k, v = map(lambda t: rearrange(t, 'b1 b2 n (h d) -> b1 b2 h n d', h=self.n_head), qkv) - - 
q = self.to_q(in_data) - k = self.to_k(in_data) - v = self.to_v(in_data) - - # q, k, v = map(lambda t: rearrange(t, 'b1 b2 n (h d) -> b1 b2 h n d', h=self.n_head), - # [q, k, v]) - q, k, v = map(lambda t: t.view(t.shape[0], t.shape[1], t.shape[2], self.n_head, -1).permute(0, 1, 3, 2, 4), - [q, k, v]) - - q = q * self.scaling - - logits = torch.matmul(q, k.transpose(-1, -2)) - - if nonbatched_bias is not None: - logits += nonbatched_bias.unsqueeze(1) - weights = torch.softmax(logits, dim=-1) - # weights = softmax(logits) - - weighted_avg = torch.matmul(weights, v) - # weighted_avg = rearrange(weighted_avg, 'b1 b2 h n d -> b1 b2 n (h d)') - weighted_avg = weighted_avg.permute(0, 1, 3, 2, 4) - weighted_avg = weighted_avg.reshape(weighted_avg.shape[0], weighted_avg.shape[1], weighted_avg.shape[2], -1) - - if self.gating: - gate_values = self.gating_linear(in_data) - weighted_avg = bias_sigmod_ele(gate_values, self.gating_bias, weighted_avg) - - output = self.o_linear(weighted_avg) - return output diff --git a/evoformer_openfold/triangle.py b/evoformer_openfold/triangle.py deleted file mode 100644 index f479469c3836..000000000000 --- a/evoformer_openfold/triangle.py +++ /dev/null @@ -1,192 +0,0 @@ -import math - -import torch -import torch.nn as nn -from torch.nn import LayerNorm - -from .kernel import bias_dropout_add, bias_ele_dropout_residual -from .ops import Linear, SelfAttention, Transition - - -def permute_final_dims(tensor, inds): - zero_index = -1 * len(inds) - first_inds = list(range(len(tensor.shape[:zero_index]))) - return tensor.permute(first_inds + [zero_index + i for i in inds]) - - -class TriangleMultiplicationOutgoing(nn.Module): - - def __init__(self, d_pair, p_drop, c=128): - super(TriangleMultiplicationOutgoing, self).__init__() - self.d_pair = d_pair - self.c = c - - self.layernorm1 = LayerNorm(d_pair) - self.left_projection = Linear(d_pair, c) - self.right_projection = Linear(d_pair, c) - self.left_gate = Linear(d_pair, c, initializer='zeros', 
bias_init=1.) - self.right_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.) - - self.output_gate = Linear(d_pair, d_pair, initializer='zeros', bias_init=1.) - self.layernorm2 = LayerNorm(c) - self.output_projection = Linear(d_pair, d_pair, initializer='zeros', use_bias=False) - self.output_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True) - - self.p_drop = p_drop - - def forward(self, Z_raw): - Z = self.layernorm1(Z_raw) - left_proj_act = self.left_projection(Z) - right_proj_act = self.right_projection(Z) - - left_proj_act = left_proj_act * torch.sigmoid(self.left_gate(Z)) - right_proj_act = right_proj_act * torch.sigmoid(self.right_gate(Z)) - - g = torch.sigmoid(self.output_gate(Z)) - # p = torch.matmul( - # permute_final_dims(left_proj_act, (2, 0, 1)), - # permute_final_dims(right_proj_act, (2, 1, 0)), - # ) - # ab = permute_final_dims(p, (1, 2, 0)) - - ab = torch.einsum('bikd,bjkd->bijd', left_proj_act, right_proj_act) - ab = self.output_projection(self.layernorm2(ab)) - dropout_mask = torch.ones_like(Z[:, 0:1, :, :]).to(Z.device).to(Z.dtype) - return bias_ele_dropout_residual(ab, - self.output_bias, - g, - dropout_mask, - Z_raw, - prob=self.p_drop) - - -class TriangleMultiplicationIncoming(nn.Module): - - def __init__(self, d_pair, p_drop, c=128): - super(TriangleMultiplicationIncoming, self).__init__() - self.d_pair = d_pair - self.c = c - - self.layernorm1 = LayerNorm(d_pair) - self.left_projection = Linear(d_pair, c) - self.right_projection = Linear(d_pair, c) - self.left_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.) - self.right_gate = Linear(d_pair, c, initializer='zeros', bias_init=1.) - - self.output_gate = Linear(d_pair, d_pair, initializer='zeros', bias_init=1.) 
- self.layernorm2 = LayerNorm(c) - self.output_projection = Linear(d_pair, d_pair, initializer='zeros', use_bias=False) - self.output_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True) - - self.p_drop = p_drop - - def forward(self, Z_raw): - Z = self.layernorm1(Z_raw) - left_proj_act = self.left_projection(Z) - right_proj_act = self.right_projection(Z) - - left_proj_act = left_proj_act * torch.sigmoid(self.left_gate(Z)) - right_proj_act = right_proj_act * torch.sigmoid(self.right_gate(Z)) - - g = torch.sigmoid(self.output_gate(Z)) - # p = torch.matmul( - # permute_final_dims(left_proj_act, (2, 1, 0)), - # permute_final_dims(right_proj_act, (2, 0, 1)), - # ) - # ab = permute_final_dims(p, (1, 2, 0)) - - ab = torch.einsum('bkid,bkjd->bijd', left_proj_act, right_proj_act) - ab = self.output_projection(self.layernorm2(ab)) - dropout_mask = torch.ones_like(Z[:, 0:1, :, :]).to(Z.device).to(Z.dtype) - return bias_ele_dropout_residual(ab, - self.output_bias, - g, - dropout_mask, - Z_raw, - prob=self.p_drop) - - -class TriangleAttentionStartingNode(nn.Module): - - def __init__(self, d_pair, p_drop, c=32, n_head=4): - super(TriangleAttentionStartingNode, self).__init__() - self.d_pair = d_pair - self.c = c - self.n_head = n_head - self.p_drop = p_drop - - self.layernorm1 = LayerNorm(d_pair) - _init_weights = torch.nn.init.normal_(torch.zeros([d_pair, n_head]), - std=1.0 / math.sqrt(d_pair)) - self.linear_b_weights = nn.parameter.Parameter(data=_init_weights) - self.attention = SelfAttention(qkv_dim=d_pair, - c=c, - n_head=n_head, - out_dim=d_pair, - gating=True, - last_bias_fuse=True) - - self.out_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True) - - def forward(self, Z_raw): - Z = self.layernorm1(Z_raw) - b = torch.einsum('bqkc,ch->bhqk', Z, self.linear_b_weights) - - Z = self.attention(Z, b) - - dropout_mask = torch.ones_like(Z[:, 0:1, :, :]).to(Z.device).to(Z.dtype) - return bias_dropout_add(Z, self.out_bias, 
dropout_mask, Z_raw, prob=self.p_drop) - - -class TriangleAttentionEndingNode(nn.Module): - - def __init__(self, d_pair, p_drop, c=32, n_head=4): - super(TriangleAttentionEndingNode, self).__init__() - self.d_pair = d_pair - self.c = c - self.n_head = n_head - self.p_drop = p_drop - - self.layernorm1 = LayerNorm(d_pair) - _init_weights = torch.nn.init.normal_(torch.zeros([d_pair, n_head]), - std=1.0 / math.sqrt(d_pair)) - self.linear_b_weights = nn.parameter.Parameter(data=_init_weights) - self.attention = SelfAttention(qkv_dim=d_pair, - c=c, - n_head=n_head, - out_dim=d_pair, - gating=True, - last_bias_fuse=True) - - self.out_bias = nn.parameter.Parameter(data=torch.zeros((d_pair,)), requires_grad=True) - - def forward(self, Z_raw): - Z = Z_raw.transpose(-2, -3) - Z = self.layernorm1(Z) - b = torch.einsum('bqkc,ch->bhqk', Z, self.linear_b_weights) - - Z = self.attention(Z, b) - - Z = Z.transpose(-2, -3) - dropout_mask = torch.ones_like(Z[:, :, 0:1, :]).to(Z.device).to(Z.dtype) - return bias_dropout_add(Z, self.out_bias, dropout_mask, Z_raw, prob=self.p_drop) - - -class PairStack(nn.Module): - - def __init__(self, d_pair, p_drop=0.25): - super(PairStack, self).__init__() - - self.TriangleMultiplicationOutgoing = TriangleMultiplicationOutgoing(d_pair, p_drop=p_drop) - self.TriangleMultiplicationIncoming = TriangleMultiplicationIncoming(d_pair, p_drop=p_drop) - self.TriangleAttentionStartingNode = TriangleAttentionStartingNode(d_pair, p_drop=p_drop) - self.TriangleAttentionEndingNode = TriangleAttentionEndingNode(d_pair, p_drop=p_drop) - self.PairTransition = Transition(d=d_pair) - - def forward(self, pair): - pair = self.TriangleMultiplicationOutgoing(pair) - pair = self.TriangleMultiplicationIncoming(pair) - pair = self.TriangleAttentionStartingNode(pair) - pair = self.TriangleAttentionEndingNode(pair) - pair = self.PairTransition(pair) - return pair diff --git a/openfold/evoformer.py b/openfold/evoformer.py index 7fbcd8a76b4d..ffd4c982987a 100644 --- 
a/openfold/evoformer.py +++ b/openfold/evoformer.py @@ -284,104 +284,6 @@ def forward(self, return m, z -class ExtraMSABlock(nn.Module): - """ - Almost identical to the standard EvoformerBlock, except in that the - ExtraMSABlock uses GlobalAttention for MSA column attention and - requires more fine-grained control over checkpointing. Separated from - its twin to preserve the TorchScript-ability of the latter. - """ - def __init__(self, - c_m: int, - c_z: int, - c_hidden_msa_att: int, - c_hidden_opm: int, - c_hidden_mul: int, - c_hidden_pair_att: int, - no_heads_msa: int, - no_heads_pair: int, - transition_n: int, - msa_dropout: float, - pair_dropout: float, - inf: float, - eps: float, - ckpt: bool, - is_multimer: bool, - ): - super(ExtraMSABlock, self).__init__() - - self.ckpt = ckpt - - self.msa_att_row = MSARowAttentionWithPairBias( - c_m=c_m, - c_z=c_z, - c_hidden=c_hidden_msa_att, - no_heads=no_heads_msa, - inf=inf, - ) - - self.msa_att_col = MSAColumnGlobalAttention( - c_in=c_m, - c_hidden=c_hidden_msa_att, - no_heads=no_heads_msa, - inf=inf, - eps=eps, - ) - - self.msa_dropout_layer = DropoutRowwise(msa_dropout) - - self.core = EvoformerBlockCore( - c_m=c_m, - c_z=c_z, - c_hidden_opm=c_hidden_opm, - c_hidden_mul=c_hidden_mul, - c_hidden_pair_att=c_hidden_pair_att, - no_heads_msa=no_heads_msa, - no_heads_pair=no_heads_pair, - transition_n=transition_n, - pair_dropout=pair_dropout, - inf=inf, - eps=eps, - ) - self.is_multimer = is_multimer - - def forward(self, - m: torch.Tensor, - z: torch.Tensor, - msa_mask: torch.Tensor, - pair_mask: torch.Tensor, - chunk_size: Optional[int] = None, - _chunk_logits: Optional[int] = 1024, - ) -> Tuple[torch.Tensor, torch.Tensor]: - m = m + self.msa_dropout_layer( - self.msa_att_row( - m.clone(), - z=z.clone(), - mask=msa_mask, - chunk_size=chunk_size, - _chunk_logits=_chunk_logits if torch.is_grad_enabled() else None, - _checkpoint_chunks= - self.ckpt if torch.is_grad_enabled() else False, - ) - ) - - def fn(m, z): - m = m + 
self.msa_att_col(m, mask=msa_mask, chunk_size=chunk_size) - m, z = self.core( - m, z, msa_mask=msa_mask, pair_mask=pair_mask, chunk_size=chunk_size - ) - - return m, z - - if(torch.is_grad_enabled() and self.ckpt): - checkpoint_fn = get_checkpoint_fn() - m, z = checkpoint_fn(fn, m, z) - else: - m, z = fn(m, z) - - return m, z - - class EvoformerStack(nn.Module): """ Main Evoformer trunk. @@ -527,99 +429,3 @@ def block_with_cache_clear(block, *args): s = self.linear(m[..., 0, :, :]) return m, z, s - - -class ExtraMSAStack(nn.Module): - """ - Implements Algorithm 18. - """ - - def __init__(self, - c_m: int, - c_z: int, - c_hidden_msa_att: int, - c_hidden_opm: int, - c_hidden_mul: int, - c_hidden_pair_att: int, - no_heads_msa: int, - no_heads_pair: int, - no_blocks: int, - transition_n: int, - msa_dropout: float, - pair_dropout: float, - inf: float, - eps: float, - ckpt: bool, - clear_cache_between_blocks: bool = False, - is_multimer: bool = False, - **kwargs, - ): - super(ExtraMSAStack, self).__init__() - - self.clear_cache_between_blocks = clear_cache_between_blocks - self.blocks = nn.ModuleList() - for _ in range(no_blocks): - block = ExtraMSABlock( - c_m=c_m, - c_z=c_z, - c_hidden_msa_att=c_hidden_msa_att, - c_hidden_opm=c_hidden_opm, - c_hidden_mul=c_hidden_mul, - c_hidden_pair_att=c_hidden_pair_att, - no_heads_msa=no_heads_msa, - no_heads_pair=no_heads_pair, - transition_n=transition_n, - msa_dropout=msa_dropout, - pair_dropout=pair_dropout, - inf=inf, - eps=eps, - ckpt=ckpt, - is_multimer=is_multimer, - ) - self.blocks.append(block) - - def forward(self, - m: torch.Tensor, - z: torch.Tensor, - chunk_size: int, - msa_mask: Optional[torch.Tensor] = None, - pair_mask: Optional[torch.Tensor] = None, - _mask_trans: bool = True, - ) -> torch.Tensor: - """ - Args: - m: - [*, N_extra, N_res, C_m] extra MSA embedding - z: - [*, N_res, N_res, C_z] pair embedding - msa_mask: - Optional [*, N_extra, N_res] MSA mask - pair_mask: - Optional [*, N_res, N_res] pair mask - 
Returns: - [*, N_res, N_res, C_z] pair update - """ - #checkpoint_fn = get_checkpoint_fn() - #blocks = [ - # partial(b, msa_mask=msa_mask, pair_mask=pair_mask, chunk_size=chunk_size, _chunk_logits=None) for b in self.blocks - #] - - #def dodo(b, *args): - # torch.cuda.empty_cache() - # return b(*args) - - #blocks = [partial(dodo, b) for b in blocks] - - #for b in blocks: - # if(torch.is_grad_enabled()): - # m, z = checkpoint_fn(b, *(m, z)) - # else: - # m, z = b(m, z) - - for b in self.blocks: - m, z = b(m, z, msa_mask, pair_mask, chunk_size=chunk_size) - - if(self.clear_cache_between_blocks): - torch.cuda.empty_cache() - - return z \ No newline at end of file From f5515e9978564bddc0ff97c06c7a6933668e7cef Mon Sep 17 00:00:00 2001 From: oahzxl Date: Thu, 29 Dec 2022 16:55:47 +0800 Subject: [PATCH 073/209] use max_mem to control stratge --- chunk_codegen.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 033db50dbccb..1c8be65d490a 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -1398,14 +1398,18 @@ def estimate_chunk_inference_mem( class ChunkSelector(object): def __init__( - self, index_tracer: IndexTracer, memory_estimator: MemoryEstimator, stratge, max_memory=None + self, + index_tracer: IndexTracer, + memory_estimator: MemoryEstimator, + max_memory=None, ): self.index_tracer = index_tracer self.memory_estimator = memory_estimator - assert stratge in ["min_memory", "fit_memory"] - assert (stratge == "fit_memory" and max_memory is not None) or stratge != "fit_memory" - self.stratge = stratge - self.max_memory = max_memory # MB + if max_memory is not None: + self.stratge = "fit_memory" + self.max_memory = max_memory # MB + else: + self.stratge = "min_memory" def _select_best_chunk_region( self, possible_chunk_regions, chunk_infos, peak_node, max_chunk_region, mem_peak @@ -1538,6 +1542,8 @@ def _select_min_memory_chunk_region(self, possible_chunk_regions, chunk_infos): 
possible_chunk_regions.remove(i) max_region_range = 0 best_region = None + if best_region is not None: + best_region["chunk_size"] = 2 return best_region def _is_legal_region(self, cur_chunk_info, chunk_infos): @@ -1563,7 +1569,7 @@ def __init__(self, gm, max_memory=None) -> None: self.index_tracer.trace_index() self.memory_estimator = MemoryEstimator(self.index_tracer) self.chunk_selector = ChunkSelector( - self.index_tracer, self.memory_estimator, stratge="fit_memory", max_memory=max_memory + self.index_tracer, self.memory_estimator, max_memory=max_memory ) def _find_peak_node(self, mem_peak): @@ -2233,7 +2239,7 @@ def emit_node(node: Node, body): delete_unused_values, self.meta_node, self.meta_graph, - self.max_memory + self.max_memory, ) if len(body) == 0: From e5a5fbb8a94313722542b72f601b8433eef1e5dc Mon Sep 17 00:00:00 2001 From: oahzxl Date: Sat, 31 Dec 2022 01:00:06 +0800 Subject: [PATCH 074/209] update source add --- chunk_codegen.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 1c8be65d490a..de58a61b943b 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -133,24 +133,28 @@ def _inherit_all_computation(self, node_from, node_to): def _add_source(self, node_from, node_from_dim, node_to, node_to_dim, init=False): node_from_dim = self._transform_index(node_from, node_from_dim) - node_from_trace = self._find_trace_from_node(node_from) + node_from_trace_source = self._find_source_trace_from_node(node_from) node_to_dim = self._transform_index(node_to, node_to_dim) - node_to_trace = self._find_trace_from_node(node_to) + node_to_trace_source = self._find_source_trace_from_node(node_to) node_from_idx = _find_idx_by_name(node_from.name, self.node_list) if init: - node_to_trace["source"][node_to_dim] = {} + node_to_trace_source[node_to_dim] = {} # add dim to cur new source - if node_from_idx not in node_to_trace["source"][node_to_dim]: - 
node_to_trace["source"][node_to_dim][node_from_idx] = [node_from_dim] + if node_from_idx not in node_to_trace_source[node_to_dim]: + node_to_trace_source[node_to_dim][node_from_idx] = [node_from_dim] else: - if node_from_dim not in node_to_trace["source"][node_to_dim][node_from_idx]: - node_to_trace["source"][node_to_dim][node_from_idx].append( + if node_from_dim not in node_to_trace_source[node_to_dim][node_from_idx]: + node_to_trace_source[node_to_dim][node_from_idx].append( node_from_dim ) # update inputs source - node_to_trace["source"][node_to_dim].update( - node_from_trace["source"][node_from_dim] - ) + for node_idx, node_dim in node_from_trace_source[node_from_dim].items(): + if node_idx not in node_to_trace_source[node_to_dim]: + node_to_trace_source[node_to_dim][node_idx] = copy.deepcopy(node_dim) + else: + for d in node_dim: + if d not in node_to_trace_source[node_to_dim][node_idx]: + node_to_trace_source[node_to_dim][node_idx].append(d) def _mark_computation_from_node(self, node_from, node_to, exclude=None): if exclude == None: @@ -1761,9 +1765,9 @@ def search_region(self): ) if self._stop_search(init_mem_peak, mem_peak): break - # self.memory_estimator.estimate_chunk_inference_mem( - # self.index_tracer.node_list, chunk_infos, print_mem=True - # ) + self.memory_estimator.estimate_chunk_inference_mem( + self.index_tracer.node_list, chunk_infos, print_mem=True + ) return chunk_infos From 966e4ea0cbf1cd17696aa90b6b9bd4a6999cfba4 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Sat, 31 Dec 2022 02:20:07 +0800 Subject: [PATCH 075/209] add reorder in mem estimator --- chunk_codegen.py | 43 ++++++++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index de58a61b943b..e20d151da1fb 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -1040,11 +1040,13 @@ def _reorder_chunk_info(self, chunk_info, reorder_map): chunk_info["region"][0] + len(chunk_info["args"]["prepose_nodes"]), 
chunk_info["region"][1], ) + new_inputs_dim = [] for idx, input_dim in enumerate(chunk_info["inputs_dim"]): new_input_dim = {} for k, v in input_dim.items(): new_input_dim[reorder_map[k]] = v - chunk_info["inputs_dim"][idx] = new_input_dim + new_inputs_dim.append(new_input_dim) + chunk_info["inputs_dim"] = new_inputs_dim return chunk_info def _update_all_reorder_map(self, reorder_map): @@ -1095,11 +1097,24 @@ def reorder_node_list(self, node_list): for old_idx, new_idx in self.all_reorder_map.items(): new_node_list[new_idx] = node_list[old_idx] return new_node_list + + def tmp_reorder(self, node_list, chunk_info): + if len(chunk_info["args"]["prepose_nodes"]) == 0: + return node_list, chunk_info + reorder_map = self._get_reorder_map(chunk_info) + + # new tmp node list + new_node_list = [None for _ in range(len(node_list))] + for old_idx, new_idx in reorder_map.items(): + new_node_list[new_idx] = node_list[old_idx] + + chunk_info = self._reorder_chunk_info(chunk_info, reorder_map) + return new_node_list, chunk_info class MemoryEstimator(object): def __init__(self, index_tracer: IndexTracer) -> None: - self.index_tracer = index_tracer + pass def _get_meta_node_size(self, x): x = x.meta["tensor_meta"] @@ -1453,9 +1468,11 @@ def _select_fit_memory_chunk_region( # get mem for chunk region regions_dict = [] for region in possible_chunk_regions: - cur_chunk_infos = chunk_infos + [region] + cur_region = region.copy() + cur_node_list, cur_region = self.index_tracer.tmp_reorder(self.index_tracer.node_list, cur_region) + cur_chunk_infos = chunk_infos + [cur_region] cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem( - self.index_tracer.node_list, cur_chunk_infos + cur_node_list, cur_chunk_infos )[0] cur_chunk_region_peak = cur_mem_peak[ max_chunk_region[0] : max_chunk_region[1] + 1 @@ -1492,9 +1509,11 @@ def _get_fit_chunk_size(self, chunk_info, chunk_infos): while cur_chunk_max_mem < self.max_memory: chunk_size *= 2 chunk_info["chunk_size"] = chunk_size - 
cur_chunk_infos = chunk_infos + [chunk_info] + cur_chunk_info = chunk_info.copy() + cur_node_list, cur_chunk_info = self.index_tracer.tmp_reorder(self.index_tracer.node_list, cur_chunk_info) + cur_chunk_infos = chunk_infos + [cur_chunk_info] cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem( - self.index_tracer.node_list, cur_chunk_infos + cur_node_list, cur_chunk_infos )[0] cur_chunk_max_mem = max( cur_mem_peak[chunk_info["region"][0] : chunk_info["region"][1] + 1] @@ -1511,11 +1530,13 @@ def _chunk_size_binary_search(self, l, r, chunk_info, chunk_infos): else: gap = 1 while r >= l + gap: - mid = int(l + (r - l) / 2) + mid = int((l + r) / 2 + 0.5) chunk_info["chunk_size"] = mid - cur_chunk_infos = chunk_infos + [chunk_info] + cur_chunk_info = chunk_info.copy() + cur_node_list, cur_chunk_info = self.index_tracer.tmp_reorder(self.index_tracer.node_list, cur_chunk_info) + cur_chunk_infos = chunk_infos + [cur_chunk_info] cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem( - self.index_tracer.node_list, cur_chunk_infos + cur_node_list, cur_chunk_infos )[0] cur_chunk_max_mem = max( cur_mem_peak[chunk_info["region"][0] : chunk_info["region"][1] + 1] @@ -1529,7 +1550,7 @@ def _chunk_size_binary_search(self, l, r, chunk_info, chunk_infos): def _get_compute_node_num(self, start, end): count = 0 for i in self.index_tracer.node_list[start : end + 1]: - if _is_non_compute_node(i): + if not _is_non_compute_node(i): count += 1 return count @@ -1547,7 +1568,7 @@ def _select_min_memory_chunk_region(self, possible_chunk_regions, chunk_infos): max_region_range = 0 best_region = None if best_region is not None: - best_region["chunk_size"] = 2 + best_region["chunk_size"] = 1 return best_region def _is_legal_region(self, cur_chunk_info, chunk_infos): From 80efd70c725b00c236b80b68393c0d13ec457b0b Mon Sep 17 00:00:00 2001 From: oahzxl Date: Sat, 31 Dec 2022 13:44:46 +0800 Subject: [PATCH 076/209] improve reorder efficeincy --- chunk_codegen.py | 33 
+++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index e20d151da1fb..7c334c617c7b 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -1486,6 +1486,8 @@ def _select_fit_memory_chunk_region( "chunk_len": self._get_compute_node_num( region["region"][0], region["region"][1] ), + "reorder_chunk_info": cur_region, + "reorder_node_list": cur_node_list } ) # no region found @@ -1495,48 +1497,47 @@ def _select_fit_memory_chunk_region( # select the min chunk len chunk_len = [i["chunk_len"] for i in regions_dict] best_region_idx = chunk_len.index(min(chunk_len)) - best_region = regions_dict[best_region_idx]["chunk_info"] + best_region = regions_dict[best_region_idx] # get max chunk size best_region = self._get_fit_chunk_size(best_region, chunk_infos) return best_region - def _get_fit_chunk_size(self, chunk_info, chunk_infos): + def _get_fit_chunk_size(self, chunk_region_dict, chunk_infos): chunk_size = 1 - chunk_info["chunk_size"] = chunk_size + reorder_chunk_info = chunk_region_dict['reorder_chunk_info'] + reorder_chunk_info["chunk_size"] = chunk_size cur_chunk_max_mem = 0 # search a region while cur_chunk_max_mem < self.max_memory: chunk_size *= 2 - chunk_info["chunk_size"] = chunk_size - cur_chunk_info = chunk_info.copy() - cur_node_list, cur_chunk_info = self.index_tracer.tmp_reorder(self.index_tracer.node_list, cur_chunk_info) - cur_chunk_infos = chunk_infos + [cur_chunk_info] + reorder_chunk_info["chunk_size"] = chunk_size + cur_chunk_infos = chunk_infos + [reorder_chunk_info] cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem( - cur_node_list, cur_chunk_infos + chunk_region_dict['reorder_node_list'], cur_chunk_infos )[0] cur_chunk_max_mem = max( - cur_mem_peak[chunk_info["region"][0] : chunk_info["region"][1] + 1] + cur_mem_peak[reorder_chunk_info["region"][0] : reorder_chunk_info["region"][1] + 1] ) # search exact size + chunk_info = chunk_region_dict["chunk_info"] 
chunk_info["chunk_size"] = self._chunk_size_binary_search( - chunk_size // 2, chunk_size, chunk_info, chunk_infos + chunk_size // 2, chunk_size, chunk_region_dict, chunk_infos ) return chunk_info - def _chunk_size_binary_search(self, l, r, chunk_info, chunk_infos): + def _chunk_size_binary_search(self, l, r, chunk_region_dict, chunk_infos): if l >= 16: gap = 4 else: gap = 1 + chunk_info = chunk_region_dict['reorder_chunk_info'] while r >= l + gap: mid = int((l + r) / 2 + 0.5) chunk_info["chunk_size"] = mid - cur_chunk_info = chunk_info.copy() - cur_node_list, cur_chunk_info = self.index_tracer.tmp_reorder(self.index_tracer.node_list, cur_chunk_info) - cur_chunk_infos = chunk_infos + [cur_chunk_info] + cur_chunk_infos = chunk_infos + [chunk_info] cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem( - cur_node_list, cur_chunk_infos + chunk_region_dict['reorder_node_list'], cur_chunk_infos )[0] cur_chunk_max_mem = max( cur_mem_peak[chunk_info["region"][0] : chunk_info["region"][1] + 1] @@ -1904,7 +1905,7 @@ def _find_idx_by_name(name, nodes_list): def _replace_name(context, name_from, name_to): - patterns = [(" ", " "), (" ", "."), (" ", ","), ("(", ")"), ("(", ",")] + patterns = [(" ", " "), (" ", "."), (" ", ","), ("(", ")"), ("(", ","), (" ", ")")] for p in patterns: source = p[0] + name_from + p[1] target = p[0] + name_to + p[1] From 5f24f4fd55956904d024d8835029ffcd0cc203a5 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Sat, 31 Dec 2022 16:29:43 +0800 Subject: [PATCH 077/209] support ones_like, add prompt if fit mode search fail --- chunk_codegen.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 7c334c617c7b..6f8ff2b23ff0 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -1406,9 +1406,9 @@ def estimate_chunk_inference_mem( # self._print_mem_log(act_memory_peak_log, node_list, "peak") # self._print_mem_log(act_memory_after_node_log, node_list, "after") 
self._print_compute_op_mem_log(act_memory_peak_log, node_list, "peak") - self._print_compute_op_mem_log( - act_memory_after_node_log, node_list, "after" - ) + # self._print_compute_op_mem_log( + # act_memory_after_node_log, node_list, "after" + # ) # param_memory = parameter_size(gm) # all_memory = act_memory + param_memory @@ -1465,6 +1465,9 @@ def _select_fit_memory_chunk_region( if i in possible_chunk_regions: possible_chunk_regions.remove(i) + if len(possible_chunk_regions) == 0: + return None + # get mem for chunk region regions_dict = [] for region in possible_chunk_regions: @@ -1492,7 +1495,7 @@ def _select_fit_memory_chunk_region( ) # no region found if len(regions_dict) == 0: - return None + raise RuntimeError("Search failed. Try a larger memory threshold.") # select the min chunk len chunk_len = [i["chunk_len"] for i in regions_dict] @@ -1995,6 +1998,14 @@ def emit_code_with_chunk( body[-1] = _replace_name( body[-1], input_node.name, input_node.name + chunk_slice ) + # ones like + if "ones_like" in node.name: + chunk_slice = _gen_chunk_slice_dim( + chunk_search[region_idx]["node_chunk_dim"][chunk_region_search.index_tracer.node_list[node_idx]]["chunk_dim"], "chunk_idx", _get_node_shape(node) + ) + body[-1] = _replace_name( + body[-1], node.args[0].name, node.args[0].name + chunk_slice + ) body[-1] = _replace_reshape_size( body[-1], node.name, chunk_search[region_idx]["reshape_size"] ) From 7fd3b45af21345cff9334682e277d7669c730814 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Mon, 2 Jan 2023 00:04:47 +0800 Subject: [PATCH 078/209] fix a bug in ones like, dont gen chunk if dim size is 1 --- autochunk_benchmark.py | 4 ++-- chunk_codegen.py | 41 +++++++++++++++++++++++++++-------------- 2 files changed, 29 insertions(+), 16 deletions(-) diff --git a/autochunk_benchmark.py b/autochunk_benchmark.py index 679016438c59..3b48d7e461fe 100644 --- a/autochunk_benchmark.py +++ b/autochunk_benchmark.py @@ -16,9 +16,9 @@ def _benchmark_evoformer(model: torch.nn.Module, 
node, pair, title, chunk_size=N torch.cuda.reset_peak_memory_stats() now_mem = torch.cuda.memory_allocated() / 1024**2 - loop = 16 + loop = 3 with torch.no_grad(): - for _ in range(loop // 4): + for _ in range(loop // 2 + 1): if chunk_size: model(node, pair, chunk_size) else: diff --git a/chunk_codegen.py b/chunk_codegen.py index 6f8ff2b23ff0..6f21f26f37e1 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -144,9 +144,7 @@ def _add_source(self, node_from, node_from_dim, node_to, node_to_dim, init=False node_to_trace_source[node_to_dim][node_from_idx] = [node_from_dim] else: if node_from_dim not in node_to_trace_source[node_to_dim][node_from_idx]: - node_to_trace_source[node_to_dim][node_from_idx].append( - node_from_dim - ) + node_to_trace_source[node_to_dim][node_from_idx].append(node_from_dim) # update inputs source for node_idx, node_dim in node_from_trace_source[node_from_dim].items(): if node_idx not in node_to_trace_source[node_to_dim]: @@ -1097,17 +1095,17 @@ def reorder_node_list(self, node_list): for old_idx, new_idx in self.all_reorder_map.items(): new_node_list[new_idx] = node_list[old_idx] return new_node_list - + def tmp_reorder(self, node_list, chunk_info): if len(chunk_info["args"]["prepose_nodes"]) == 0: return node_list, chunk_info reorder_map = self._get_reorder_map(chunk_info) - + # new tmp node list new_node_list = [None for _ in range(len(node_list))] for old_idx, new_idx in reorder_map.items(): new_node_list[new_idx] = node_list[old_idx] - + chunk_info = self._reorder_chunk_info(chunk_info, reorder_map) return new_node_list, chunk_info @@ -1472,7 +1470,9 @@ def _select_fit_memory_chunk_region( regions_dict = [] for region in possible_chunk_regions: cur_region = region.copy() - cur_node_list, cur_region = self.index_tracer.tmp_reorder(self.index_tracer.node_list, cur_region) + cur_node_list, cur_region = self.index_tracer.tmp_reorder( + self.index_tracer.node_list, cur_region + ) cur_chunk_infos = chunk_infos + [cur_region] cur_mem_peak = 
self.memory_estimator.estimate_chunk_inference_mem( cur_node_list, cur_chunk_infos @@ -1490,7 +1490,7 @@ def _select_fit_memory_chunk_region( region["region"][0], region["region"][1] ), "reorder_chunk_info": cur_region, - "reorder_node_list": cur_node_list + "reorder_node_list": cur_node_list, } ) # no region found @@ -1508,7 +1508,7 @@ def _select_fit_memory_chunk_region( def _get_fit_chunk_size(self, chunk_region_dict, chunk_infos): chunk_size = 1 - reorder_chunk_info = chunk_region_dict['reorder_chunk_info'] + reorder_chunk_info = chunk_region_dict["reorder_chunk_info"] reorder_chunk_info["chunk_size"] = chunk_size cur_chunk_max_mem = 0 # search a region @@ -1517,10 +1517,13 @@ def _get_fit_chunk_size(self, chunk_region_dict, chunk_infos): reorder_chunk_info["chunk_size"] = chunk_size cur_chunk_infos = chunk_infos + [reorder_chunk_info] cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem( - chunk_region_dict['reorder_node_list'], cur_chunk_infos + chunk_region_dict["reorder_node_list"], cur_chunk_infos )[0] cur_chunk_max_mem = max( - cur_mem_peak[reorder_chunk_info["region"][0] : reorder_chunk_info["region"][1] + 1] + cur_mem_peak[ + reorder_chunk_info["region"][0] : reorder_chunk_info["region"][1] + + 1 + ] ) # search exact size chunk_info = chunk_region_dict["chunk_info"] @@ -1534,13 +1537,13 @@ def _chunk_size_binary_search(self, l, r, chunk_region_dict, chunk_infos): gap = 4 else: gap = 1 - chunk_info = chunk_region_dict['reorder_chunk_info'] + chunk_info = chunk_region_dict["reorder_chunk_info"] while r >= l + gap: mid = int((l + r) / 2 + 0.5) chunk_info["chunk_size"] = mid cur_chunk_infos = chunk_infos + [chunk_info] cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem( - chunk_region_dict['reorder_node_list'], cur_chunk_infos + chunk_region_dict["reorder_node_list"], cur_chunk_infos )[0] cur_chunk_max_mem = max( cur_mem_peak[chunk_info["region"][0] : chunk_info["region"][1] + 1] @@ -2000,8 +2003,18 @@ def emit_code_with_chunk( ) 
# ones like if "ones_like" in node.name: + chunk_dim = chunk_search[region_idx]["node_chunk_dim"][ + chunk_region_search.index_tracer.node_list[node_idx] + ]["chunk_dim"] + if ( + _get_node_shape( + chunk_region_search.index_tracer.node_list[node_idx] + )[chunk_dim] + == 1 + ): + continue chunk_slice = _gen_chunk_slice_dim( - chunk_search[region_idx]["node_chunk_dim"][chunk_region_search.index_tracer.node_list[node_idx]]["chunk_dim"], "chunk_idx", _get_node_shape(node) + chunk_dim, "chunk_idx", _get_node_shape(node) ) body[-1] = _replace_name( body[-1], node.args[0].name, node.args[0].name + chunk_slice From 9c5e028a62b003136d2402b99b728eaefcc528cd Mon Sep 17 00:00:00 2001 From: oahzxl Date: Mon, 2 Jan 2023 00:27:11 +0800 Subject: [PATCH 079/209] fix bug again --- chunk_codegen.py | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 6f21f26f37e1..21ecc343a959 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -2003,22 +2003,25 @@ def emit_code_with_chunk( ) # ones like if "ones_like" in node.name: - chunk_dim = chunk_search[region_idx]["node_chunk_dim"][ - chunk_region_search.index_tracer.node_list[node_idx] - ]["chunk_dim"] - if ( - _get_node_shape( - chunk_region_search.index_tracer.node_list[node_idx] - )[chunk_dim] - == 1 - ): - continue - chunk_slice = _gen_chunk_slice_dim( - chunk_dim, "chunk_idx", _get_node_shape(node) - ) - body[-1] = _replace_name( - body[-1], node.args[0].name, node.args[0].name + chunk_slice - ) + meta_node = chunk_region_search.index_tracer.node_list[node_idx] + chunk_dim = chunk_search[region_idx]["node_chunk_dim"][meta_node][ + "chunk_dim" + ] + if _get_node_shape(meta_node)[chunk_dim] != 1: + source_node = meta_node.args[0].args[0] + if ( + source_node not in chunk_search[region_idx]["node_chunk_dim"] + or chunk_search[region_idx]["node_chunk_dim"][source_node][ + "chunk_dim" + ] + is None + ): + chunk_slice = _gen_chunk_slice_dim( + 
chunk_dim, "chunk_idx", _get_node_shape(node) + ) + body[-1] = _replace_name( + body[-1], node.args[0].name, node.args[0].name + chunk_slice + ) body[-1] = _replace_reshape_size( body[-1], node.name, chunk_search[region_idx]["reshape_size"] ) From 55cb713f36e8080313225577dde97e4d35e18108 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Thu, 5 Jan 2023 11:29:22 +0800 Subject: [PATCH 080/209] update min memory stratege, reduce mem usage by 30% --- chunk_codegen.py | 65 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 14 deletions(-) diff --git a/chunk_codegen.py b/chunk_codegen.py index 21ecc343a959..41fcb5a3c2f4 100644 --- a/chunk_codegen.py +++ b/chunk_codegen.py @@ -1433,7 +1433,11 @@ def _select_best_chunk_region( ): if self.stratge == "min_memory": best_region = self._select_min_memory_chunk_region( - possible_chunk_regions, chunk_infos + possible_chunk_regions, + chunk_infos, + peak_node, + max_chunk_region, + mem_peak, ) elif self.stratge == "fit_memory": best_region = self._select_fit_memory_chunk_region( @@ -1561,19 +1565,52 @@ def _get_compute_node_num(self, start, end): count += 1 return count - def _select_min_memory_chunk_region(self, possible_chunk_regions, chunk_infos): - max_region_range = 0 - best_region = None - while len(possible_chunk_regions) > 0: - for i in possible_chunk_regions: - if i["region"][1] - i["region"][0] > max_region_range: - best_region = i - max_region_range = i["region"][1] - i["region"][0] - if self._is_legal_region(best_region, chunk_infos): - break - possible_chunk_regions.remove(i) - max_region_range = 0 - best_region = None + def _select_min_memory_chunk_region( + self, possible_chunk_regions, chunk_infos, peak_node, max_chunk_region, mem_peak + ): + # remove illegal regions + illegal_regions = [] + for i in possible_chunk_regions: + if not self._is_legal_region(i, chunk_infos): + illegal_regions.append(i) + for i in illegal_regions: + if i in possible_chunk_regions: + 
possible_chunk_regions.remove(i) + + if len(possible_chunk_regions) == 0: + return None + + # get mem for chunk region + regions_dict = [] + for region in possible_chunk_regions: + cur_region = region.copy() + cur_node_list, cur_region = self.index_tracer.tmp_reorder( + self.index_tracer.node_list, cur_region + ) + cur_chunk_infos = chunk_infos + [cur_region] + cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem( + cur_node_list, cur_chunk_infos + )[0] + cur_chunk_region_peak = cur_mem_peak[ + max_chunk_region[0] : max_chunk_region[1] + 1 + ] + cur_chunk_region_max_peak = max(cur_chunk_region_peak) + regions_dict.append( + { + "chunk_info": region, + "chunk_max_mem": cur_chunk_region_max_peak, + "chunk_len": self._get_compute_node_num( + region["region"][0], region["region"][1] + ), + "reorder_chunk_info": cur_region, + "reorder_node_list": cur_node_list, + } + ) + + # select the min mem + chunk_max_mem = [i["chunk_max_mem"] for i in regions_dict] + best_region_idx = chunk_max_mem.index(min(chunk_max_mem)) + best_region = regions_dict[best_region_idx]["chunk_info"] if best_region is not None: best_region["chunk_size"] = 1 return best_region From 71e72c48907195096ef02be73e1c5b0feea2653d Mon Sep 17 00:00:00 2001 From: oahzxl Date: Thu, 5 Jan 2023 17:54:25 +0800 Subject: [PATCH 081/209] last version of benchmark --- autochunk_benchmark.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/autochunk_benchmark.py b/autochunk_benchmark.py index 3b48d7e461fe..c938485efc05 100644 --- a/autochunk_benchmark.py +++ b/autochunk_benchmark.py @@ -93,22 +93,24 @@ def _build_openfold(): def benchmark_evoformer(): # init data and model - msa_len = 300 - pair_len = 800 + msa_len = 256 + pair_len = 2048 node = torch.randn(1, msa_len, pair_len, 256).cuda() pair = torch.randn(1, pair_len, pair_len, 128).cuda() model = evoformer_base().cuda() # build autochunk model - max_memory = 3000 # MB + max_memory = 10000 # MB fit memory mode + # max_memory 
= None # min memory mode autochunk = _build_autochunk(evoformer_base().cuda(), max_memory, node, pair) # build openfold + chunk_size = 64 openfold = _build_openfold() # benchmark _benchmark_evoformer(model, node, pair, "base") - _benchmark_evoformer(openfold, node, pair, "openfold", chunk_size=4) + _benchmark_evoformer(openfold, node, pair, "openfold", chunk_size=chunk_size) _benchmark_evoformer(autochunk, node, pair, "autochunk") From 27ab5240965fc9cc0ec74ff48356abcbf098bd74 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Fri, 6 Jan 2023 11:07:57 +0800 Subject: [PATCH 082/209] refactor structure --- .../chunk_codegen.py | 41 ++++++++----------- .../evoformer}/evoformer.py | 0 .../evoformer}/initializer.py | 0 {evoformer => autochunk/evoformer}/kernel.py | 0 {evoformer => autochunk/evoformer}/msa.py | 0 {evoformer => autochunk/evoformer}/ops.py | 0 .../evoformer}/triangle.py | 0 .../openfold}/checkpointing.py | 0 {openfold => autochunk/openfold}/dropout.py | 0 {openfold => autochunk/openfold}/evoformer.py | 0 {openfold => autochunk/openfold}/msa.py | 0 .../openfold}/outer_product_mean.py | 0 .../openfold}/pair_transition.py | 0 .../openfold}/primitives.py | 0 .../openfold}/tensor_utils.py | 0 .../openfold}/triangular_attention.py | 0 .../triangular_multiplicative_update.py | 0 autochunk_benchmark.py | 18 ++++---- chunk_codegen_run.py => autochunk_test.py | 4 +- 19 files changed, 29 insertions(+), 34 deletions(-) rename chunk_codegen.py => autochunk/chunk_codegen.py (98%) rename {evoformer => autochunk/evoformer}/evoformer.py (100%) rename {evoformer => autochunk/evoformer}/initializer.py (100%) rename {evoformer => autochunk/evoformer}/kernel.py (100%) rename {evoformer => autochunk/evoformer}/msa.py (100%) rename {evoformer => autochunk/evoformer}/ops.py (100%) rename {evoformer => autochunk/evoformer}/triangle.py (100%) rename {openfold => autochunk/openfold}/checkpointing.py (100%) rename {openfold => autochunk/openfold}/dropout.py (100%) rename {openfold => 
autochunk/openfold}/evoformer.py (100%) rename {openfold => autochunk/openfold}/msa.py (100%) rename {openfold => autochunk/openfold}/outer_product_mean.py (100%) rename {openfold => autochunk/openfold}/pair_transition.py (100%) rename {openfold => autochunk/openfold}/primitives.py (100%) rename {openfold => autochunk/openfold}/tensor_utils.py (100%) rename {openfold => autochunk/openfold}/triangular_attention.py (100%) rename {openfold => autochunk/openfold}/triangular_multiplicative_update.py (100%) rename chunk_codegen_run.py => autochunk_test.py (97%) diff --git a/chunk_codegen.py b/autochunk/chunk_codegen.py similarity index 98% rename from chunk_codegen.py rename to autochunk/chunk_codegen.py index 41fcb5a3c2f4..7a5d06689247 100644 --- a/chunk_codegen.py +++ b/autochunk/chunk_codegen.py @@ -1967,13 +1967,11 @@ def _replace_reshape_size(context, node_name, reshape_size_dict): def emit_code_with_chunk( body, - ckpt_func, nodes, emit_node_func, delete_unused_value_func, - meta_nodes, - meta_graph, - max_memory=None, + chunk_region_search, + chunk_infos ): """Emit code with nested activation checkpoint When we detect some of the node.activation_checkpoint is a List, we will use @@ -1988,23 +1986,19 @@ def emit_code_with_chunk( """ node_list = list(nodes) - # find the chunk regions - chunk_region_search = ChunkRegionSearch(meta_graph, max_memory) - chunk_search = chunk_region_search.search_region() - - chunk_regions = [i["region"] for i in chunk_search] + chunk_regions = [i["region"] for i in chunk_infos] chunk_starts = [i[0] for i in chunk_regions] chunk_ends = [i[1] for i in chunk_regions] - chunk_inputs = [i["inputs"] for i in chunk_search] - chunk_inputs_non_chunk = [i["inputs_non_chunk"] for i in chunk_search] - chunk_inputs_dim = [i["inputs_dim"] for i in chunk_search] + chunk_inputs = [i["inputs"] for i in chunk_infos] + chunk_inputs_non_chunk = [i["inputs_non_chunk"] for i in chunk_infos] + chunk_inputs_dim = [i["inputs_dim"] for i in chunk_infos] 
chunk_inputs_names = [j.name for i in chunk_inputs for j in i] + [ j.name for i in chunk_inputs_non_chunk for j in i ] - chunk_outputs = [i["outputs"][0] for i in chunk_search] - chunk_outputs_dim = [i["outputs_dim"] for i in chunk_search] + chunk_outputs = [i["outputs"][0] for i in chunk_infos] + chunk_outputs_dim = [i["outputs_dim"] for i in chunk_infos] node_list = chunk_region_search.index_tracer.reorder_node_list(node_list) node_idx = 0 @@ -2022,7 +2016,7 @@ def emit_code_with_chunk( chunk_inputs[region_idx], chunk_outputs[region_idx], chunk_outputs_dim[region_idx], - chunk_search[region_idx]["chunk_size"], + chunk_infos[region_idx]["chunk_size"], ) ) @@ -2041,14 +2035,14 @@ def emit_code_with_chunk( # ones like if "ones_like" in node.name: meta_node = chunk_region_search.index_tracer.node_list[node_idx] - chunk_dim = chunk_search[region_idx]["node_chunk_dim"][meta_node][ + chunk_dim = chunk_infos[region_idx]["node_chunk_dim"][meta_node][ "chunk_dim" ] if _get_node_shape(meta_node)[chunk_dim] != 1: source_node = meta_node.args[0].args[0] if ( - source_node not in chunk_search[region_idx]["node_chunk_dim"] - or chunk_search[region_idx]["node_chunk_dim"][source_node][ + source_node not in chunk_infos[region_idx]["node_chunk_dim"] + or chunk_infos[region_idx]["node_chunk_dim"][source_node][ "chunk_dim" ] is None @@ -2060,7 +2054,7 @@ def emit_code_with_chunk( body[-1], node.args[0].name, node.args[0].name + chunk_slice ) body[-1] = _replace_reshape_size( - body[-1], node.name, chunk_search[region_idx]["reshape_size"] + body[-1], node.name, chunk_infos[region_idx]["reshape_size"] ) body[-1] = " " + body[-1] delete_unused_value_func(node, body, chunk_inputs_names) @@ -2092,6 +2086,9 @@ def __init__(self, meta_graph, max_memory=None): self.meta_graph = meta_graph self.max_memory = max_memory self.meta_node = list(meta_graph.graph.nodes) + # find the chunk regions + self.chunk_region_search = ChunkRegionSearch(meta_graph, max_memory) + self.chunk_infos = 
self.chunk_region_search.search_region() def _gen_python_code( self, nodes, root_module: str, namespace: _Namespace @@ -2323,13 +2320,11 @@ def emit_node(node: Node, body): # will use nested type of activation checkpoint codegen emit_code_with_chunk( body, - ckpt_func, nodes, emit_node, delete_unused_values, - self.meta_node, - self.meta_graph, - self.max_memory, + self.chunk_region_search, + self.chunk_infos ) if len(body) == 0: diff --git a/evoformer/evoformer.py b/autochunk/evoformer/evoformer.py similarity index 100% rename from evoformer/evoformer.py rename to autochunk/evoformer/evoformer.py diff --git a/evoformer/initializer.py b/autochunk/evoformer/initializer.py similarity index 100% rename from evoformer/initializer.py rename to autochunk/evoformer/initializer.py diff --git a/evoformer/kernel.py b/autochunk/evoformer/kernel.py similarity index 100% rename from evoformer/kernel.py rename to autochunk/evoformer/kernel.py diff --git a/evoformer/msa.py b/autochunk/evoformer/msa.py similarity index 100% rename from evoformer/msa.py rename to autochunk/evoformer/msa.py diff --git a/evoformer/ops.py b/autochunk/evoformer/ops.py similarity index 100% rename from evoformer/ops.py rename to autochunk/evoformer/ops.py diff --git a/evoformer/triangle.py b/autochunk/evoformer/triangle.py similarity index 100% rename from evoformer/triangle.py rename to autochunk/evoformer/triangle.py diff --git a/openfold/checkpointing.py b/autochunk/openfold/checkpointing.py similarity index 100% rename from openfold/checkpointing.py rename to autochunk/openfold/checkpointing.py diff --git a/openfold/dropout.py b/autochunk/openfold/dropout.py similarity index 100% rename from openfold/dropout.py rename to autochunk/openfold/dropout.py diff --git a/openfold/evoformer.py b/autochunk/openfold/evoformer.py similarity index 100% rename from openfold/evoformer.py rename to autochunk/openfold/evoformer.py diff --git a/openfold/msa.py b/autochunk/openfold/msa.py similarity index 100% rename 
from openfold/msa.py rename to autochunk/openfold/msa.py diff --git a/openfold/outer_product_mean.py b/autochunk/openfold/outer_product_mean.py similarity index 100% rename from openfold/outer_product_mean.py rename to autochunk/openfold/outer_product_mean.py diff --git a/openfold/pair_transition.py b/autochunk/openfold/pair_transition.py similarity index 100% rename from openfold/pair_transition.py rename to autochunk/openfold/pair_transition.py diff --git a/openfold/primitives.py b/autochunk/openfold/primitives.py similarity index 100% rename from openfold/primitives.py rename to autochunk/openfold/primitives.py diff --git a/openfold/tensor_utils.py b/autochunk/openfold/tensor_utils.py similarity index 100% rename from openfold/tensor_utils.py rename to autochunk/openfold/tensor_utils.py diff --git a/openfold/triangular_attention.py b/autochunk/openfold/triangular_attention.py similarity index 100% rename from openfold/triangular_attention.py rename to autochunk/openfold/triangular_attention.py diff --git a/openfold/triangular_multiplicative_update.py b/autochunk/openfold/triangular_multiplicative_update.py similarity index 100% rename from openfold/triangular_multiplicative_update.py rename to autochunk/openfold/triangular_multiplicative_update.py diff --git a/autochunk_benchmark.py b/autochunk_benchmark.py index c938485efc05..c34b5217e5d4 100644 --- a/autochunk_benchmark.py +++ b/autochunk_benchmark.py @@ -3,13 +3,13 @@ import torch import torch.fx -from chunk_codegen import ChunkCodeGen +from autochunk.chunk_codegen import ChunkCodeGen from colossalai.fx import ColoTracer from colossalai.fx.graph_module import ColoGraphModule from colossalai.fx.passes.meta_info_prop import MetaInfoProp from colossalai.fx.profiler import MetaTensor -from evoformer.evoformer import evoformer_base -from openfold.evoformer import EvoformerBlock +from autochunk.evoformer.evoformer import evoformer_base +from autochunk.openfold.evoformer import EvoformerBlock def 
_benchmark_evoformer(model: torch.nn.Module, node, pair, title, chunk_size=None): @@ -94,23 +94,23 @@ def _build_openfold(): def benchmark_evoformer(): # init data and model msa_len = 256 - pair_len = 2048 + pair_len = 1024 node = torch.randn(1, msa_len, pair_len, 256).cuda() pair = torch.randn(1, pair_len, pair_len, 128).cuda() model = evoformer_base().cuda() # build autochunk model - max_memory = 10000 # MB fit memory mode - # max_memory = None # min memory mode + # max_memory = 10000 # MB fit memory mode + max_memory = None # min memory mode autochunk = _build_autochunk(evoformer_base().cuda(), max_memory, node, pair) # build openfold chunk_size = 64 - openfold = _build_openfold() + # openfold = _build_openfold() # benchmark - _benchmark_evoformer(model, node, pair, "base") - _benchmark_evoformer(openfold, node, pair, "openfold", chunk_size=chunk_size) + # _benchmark_evoformer(model, node, pair, "base") + # _benchmark_evoformer(openfold, node, pair, "openfold", chunk_size=chunk_size) _benchmark_evoformer(autochunk, node, pair, "autochunk") diff --git a/chunk_codegen_run.py b/autochunk_test.py similarity index 97% rename from chunk_codegen_run.py rename to autochunk_test.py index 3a3b3c599e3e..63f393531d5c 100644 --- a/chunk_codegen_run.py +++ b/autochunk_test.py @@ -12,8 +12,8 @@ from colossalai.fx.graph_module import ColoGraphModule from colossalai.fx.passes.meta_info_prop import MetaInfoProp, TensorMetadata from colossalai.fx.profiler import MetaTensor -from evoformer.evoformer import evoformer_base -from chunk_codegen import ChunkCodeGen +from autochunk.evoformer.evoformer import evoformer_base +from autochunk.chunk_codegen import ChunkCodeGen with_codegen = True From efb1c64c30cf2ee35dad03bfd3829f014d204a8d Mon Sep 17 00:00:00 2001 From: oahzxl Date: Fri, 6 Jan 2023 11:39:26 +0800 Subject: [PATCH 083/209] restruct dir --- .../autochunk}/chunk_codegen.py | 0 .../test_autochunk/autochunk_benchmark.py | 14 +++++++------- .../test_autochunk/autochunk_test.py | 4 
++-- .../test_autochunk}/evoformer/evoformer.py | 0 .../test_autochunk}/evoformer/initializer.py | 0 .../test_autochunk}/evoformer/kernel.py | 0 .../test_autochunk}/evoformer/msa.py | 0 .../test_autochunk}/evoformer/ops.py | 0 .../test_autochunk}/evoformer/triangle.py | 0 .../test_autochunk}/openfold/checkpointing.py | 0 .../test_autochunk}/openfold/dropout.py | 0 .../test_autochunk}/openfold/evoformer.py | 18 +++++++++--------- .../test_autochunk}/openfold/msa.py | 6 +++--- .../openfold/outer_product_mean.py | 4 ++-- .../openfold/pair_transition.py | 4 ++-- .../test_autochunk}/openfold/primitives.py | 4 ++-- .../test_autochunk}/openfold/tensor_utils.py | 0 .../openfold/triangular_attention.py | 4 ++-- .../triangular_multiplicative_update.py | 4 ++-- 19 files changed, 31 insertions(+), 31 deletions(-) rename {autochunk => colossalai/autochunk}/chunk_codegen.py (100%) rename autochunk_benchmark.py => tests/test_autochunk/autochunk_benchmark.py (89%) rename autochunk_test.py => tests/test_autochunk/autochunk_test.py (96%) rename {autochunk => tests/test_autochunk}/evoformer/evoformer.py (100%) rename {autochunk => tests/test_autochunk}/evoformer/initializer.py (100%) rename {autochunk => tests/test_autochunk}/evoformer/kernel.py (100%) rename {autochunk => tests/test_autochunk}/evoformer/msa.py (100%) rename {autochunk => tests/test_autochunk}/evoformer/ops.py (100%) rename {autochunk => tests/test_autochunk}/evoformer/triangle.py (100%) rename {autochunk => tests/test_autochunk}/openfold/checkpointing.py (100%) rename {autochunk => tests/test_autochunk}/openfold/dropout.py (100%) rename {autochunk => tests/test_autochunk}/openfold/evoformer.py (96%) rename {autochunk => tests/test_autochunk}/openfold/msa.py (98%) rename {autochunk => tests/test_autochunk}/openfold/outer_product_mean.py (97%) rename {autochunk => tests/test_autochunk}/openfold/pair_transition.py (96%) rename {autochunk => tests/test_autochunk}/openfold/primitives.py (99%) rename {autochunk => 
tests/test_autochunk}/openfold/tensor_utils.py (100%) rename {autochunk => tests/test_autochunk}/openfold/triangular_attention.py (97%) rename {autochunk => tests/test_autochunk}/openfold/triangular_multiplicative_update.py (97%) diff --git a/autochunk/chunk_codegen.py b/colossalai/autochunk/chunk_codegen.py similarity index 100% rename from autochunk/chunk_codegen.py rename to colossalai/autochunk/chunk_codegen.py diff --git a/autochunk_benchmark.py b/tests/test_autochunk/autochunk_benchmark.py similarity index 89% rename from autochunk_benchmark.py rename to tests/test_autochunk/autochunk_benchmark.py index c34b5217e5d4..8df6d9ff4564 100644 --- a/autochunk_benchmark.py +++ b/tests/test_autochunk/autochunk_benchmark.py @@ -3,13 +3,13 @@ import torch import torch.fx -from autochunk.chunk_codegen import ChunkCodeGen +from colossalai.autochunk.chunk_codegen import ChunkCodeGen from colossalai.fx import ColoTracer from colossalai.fx.graph_module import ColoGraphModule from colossalai.fx.passes.meta_info_prop import MetaInfoProp from colossalai.fx.profiler import MetaTensor -from autochunk.evoformer.evoformer import evoformer_base -from autochunk.openfold.evoformer import EvoformerBlock +from tests.test_autochunk.evoformer.evoformer import evoformer_base +from tests.test_autochunk.openfold.evoformer import EvoformerBlock def _benchmark_evoformer(model: torch.nn.Module, node, pair, title, chunk_size=None): @@ -94,7 +94,7 @@ def _build_openfold(): def benchmark_evoformer(): # init data and model msa_len = 256 - pair_len = 1024 + pair_len = 256 node = torch.randn(1, msa_len, pair_len, 256).cuda() pair = torch.randn(1, pair_len, pair_len, 128).cuda() model = evoformer_base().cuda() @@ -106,11 +106,11 @@ def benchmark_evoformer(): # build openfold chunk_size = 64 - # openfold = _build_openfold() + openfold = _build_openfold() # benchmark - # _benchmark_evoformer(model, node, pair, "base") - # _benchmark_evoformer(openfold, node, pair, "openfold", chunk_size=chunk_size) + 
_benchmark_evoformer(model, node, pair, "base") + _benchmark_evoformer(openfold, node, pair, "openfold", chunk_size=chunk_size) _benchmark_evoformer(autochunk, node, pair, "autochunk") diff --git a/autochunk_test.py b/tests/test_autochunk/autochunk_test.py similarity index 96% rename from autochunk_test.py rename to tests/test_autochunk/autochunk_test.py index 63f393531d5c..5e9aaca15f9f 100644 --- a/autochunk_test.py +++ b/tests/test_autochunk/autochunk_test.py @@ -12,8 +12,8 @@ from colossalai.fx.graph_module import ColoGraphModule from colossalai.fx.passes.meta_info_prop import MetaInfoProp, TensorMetadata from colossalai.fx.profiler import MetaTensor -from autochunk.evoformer.evoformer import evoformer_base -from autochunk.chunk_codegen import ChunkCodeGen +from tests.test_autochunk.evoformer.evoformer import evoformer_base +from ...colossalai.autochunk.chunk_codegen import ChunkCodeGen with_codegen = True diff --git a/autochunk/evoformer/evoformer.py b/tests/test_autochunk/evoformer/evoformer.py similarity index 100% rename from autochunk/evoformer/evoformer.py rename to tests/test_autochunk/evoformer/evoformer.py diff --git a/autochunk/evoformer/initializer.py b/tests/test_autochunk/evoformer/initializer.py similarity index 100% rename from autochunk/evoformer/initializer.py rename to tests/test_autochunk/evoformer/initializer.py diff --git a/autochunk/evoformer/kernel.py b/tests/test_autochunk/evoformer/kernel.py similarity index 100% rename from autochunk/evoformer/kernel.py rename to tests/test_autochunk/evoformer/kernel.py diff --git a/autochunk/evoformer/msa.py b/tests/test_autochunk/evoformer/msa.py similarity index 100% rename from autochunk/evoformer/msa.py rename to tests/test_autochunk/evoformer/msa.py diff --git a/autochunk/evoformer/ops.py b/tests/test_autochunk/evoformer/ops.py similarity index 100% rename from autochunk/evoformer/ops.py rename to tests/test_autochunk/evoformer/ops.py diff --git a/autochunk/evoformer/triangle.py 
b/tests/test_autochunk/evoformer/triangle.py similarity index 100% rename from autochunk/evoformer/triangle.py rename to tests/test_autochunk/evoformer/triangle.py diff --git a/autochunk/openfold/checkpointing.py b/tests/test_autochunk/openfold/checkpointing.py similarity index 100% rename from autochunk/openfold/checkpointing.py rename to tests/test_autochunk/openfold/checkpointing.py diff --git a/autochunk/openfold/dropout.py b/tests/test_autochunk/openfold/dropout.py similarity index 100% rename from autochunk/openfold/dropout.py rename to tests/test_autochunk/openfold/dropout.py diff --git a/autochunk/openfold/evoformer.py b/tests/test_autochunk/openfold/evoformer.py similarity index 96% rename from autochunk/openfold/evoformer.py rename to tests/test_autochunk/openfold/evoformer.py index ffd4c982987a..b53ec1aa51e5 100644 --- a/autochunk/openfold/evoformer.py +++ b/tests/test_autochunk/openfold/evoformer.py @@ -19,25 +19,25 @@ from typing import Tuple, Optional from functools import partial -from openfold.primitives import Linear, LayerNorm -from openfold.dropout import DropoutRowwise, DropoutColumnwise -from openfold.msa import ( +from .primitives import Linear, LayerNorm +from .dropout import DropoutRowwise, DropoutColumnwise +from .msa import ( MSARowAttentionWithPairBias, MSAColumnAttention, MSAColumnGlobalAttention, ) -from openfold.outer_product_mean import OuterProductMean -from openfold.pair_transition import PairTransition -from openfold.triangular_attention import ( +from .outer_product_mean import OuterProductMean +from .pair_transition import PairTransition +from .triangular_attention import ( TriangleAttentionStartingNode, TriangleAttentionEndingNode, ) -from openfold.triangular_multiplicative_update import ( +from .triangular_multiplicative_update import ( TriangleMultiplicationOutgoing, TriangleMultiplicationIncoming, ) -from openfold.checkpointing import checkpoint_blocks, get_checkpoint_fn -from openfold.tensor_utils import chunk_layer +from 
.checkpointing import checkpoint_blocks, get_checkpoint_fn +from .tensor_utils import chunk_layer class MSATransition(nn.Module): diff --git a/autochunk/openfold/msa.py b/tests/test_autochunk/openfold/msa.py similarity index 98% rename from autochunk/openfold/msa.py rename to tests/test_autochunk/openfold/msa.py index 00b822e7f390..7c137286feab 100644 --- a/autochunk/openfold/msa.py +++ b/tests/test_autochunk/openfold/msa.py @@ -18,15 +18,15 @@ import torch.nn as nn from typing import Optional, List, Tuple -from openfold.primitives import ( +from .primitives import ( Linear, LayerNorm, Attention, GlobalAttention, _attention_chunked_trainable, ) -from openfold.checkpointing import get_checkpoint_fn -from openfold.tensor_utils import ( +from .checkpointing import get_checkpoint_fn +from .tensor_utils import ( chunk_layer, permute_final_dims, flatten_final_dims, diff --git a/autochunk/openfold/outer_product_mean.py b/tests/test_autochunk/openfold/outer_product_mean.py similarity index 97% rename from autochunk/openfold/outer_product_mean.py rename to tests/test_autochunk/openfold/outer_product_mean.py index 43d853833c66..daadf1c272cf 100644 --- a/autochunk/openfold/outer_product_mean.py +++ b/tests/test_autochunk/openfold/outer_product_mean.py @@ -19,8 +19,8 @@ import torch import torch.nn as nn -from openfold.primitives import Linear -from openfold.tensor_utils import chunk_layer +from .primitives import Linear +from .tensor_utils import chunk_layer class OuterProductMean(nn.Module): diff --git a/autochunk/openfold/pair_transition.py b/tests/test_autochunk/openfold/pair_transition.py similarity index 96% rename from autochunk/openfold/pair_transition.py rename to tests/test_autochunk/openfold/pair_transition.py index de76306418ee..7d09914dc3cc 100644 --- a/autochunk/openfold/pair_transition.py +++ b/tests/test_autochunk/openfold/pair_transition.py @@ -17,8 +17,8 @@ import torch import torch.nn as nn -from openfold.primitives import Linear, LayerNorm -from 
openfold.tensor_utils import chunk_layer +from .primitives import Linear, LayerNorm +from .tensor_utils import chunk_layer class PairTransition(nn.Module): diff --git a/autochunk/openfold/primitives.py b/tests/test_autochunk/openfold/primitives.py similarity index 99% rename from autochunk/openfold/primitives.py rename to tests/test_autochunk/openfold/primitives.py index bbc156f21d4a..32a9d487c441 100644 --- a/autochunk/openfold/primitives.py +++ b/tests/test_autochunk/openfold/primitives.py @@ -21,8 +21,8 @@ import torch import torch.nn as nn -from openfold.checkpointing import get_checkpoint_fn -from openfold.tensor_utils import ( +from .checkpointing import get_checkpoint_fn +from .tensor_utils import ( permute_final_dims, flatten_final_dims, _chunk_slice, diff --git a/autochunk/openfold/tensor_utils.py b/tests/test_autochunk/openfold/tensor_utils.py similarity index 100% rename from autochunk/openfold/tensor_utils.py rename to tests/test_autochunk/openfold/tensor_utils.py diff --git a/autochunk/openfold/triangular_attention.py b/tests/test_autochunk/openfold/triangular_attention.py similarity index 97% rename from autochunk/openfold/triangular_attention.py rename to tests/test_autochunk/openfold/triangular_attention.py index 6d3e37f4c681..12d09c502daf 100644 --- a/autochunk/openfold/triangular_attention.py +++ b/tests/test_autochunk/openfold/triangular_attention.py @@ -20,8 +20,8 @@ import torch import torch.nn as nn -from openfold.primitives import Linear, LayerNorm, Attention -from openfold.tensor_utils import ( +from .primitives import Linear, LayerNorm, Attention +from .tensor_utils import ( chunk_layer, permute_final_dims, flatten_final_dims, diff --git a/autochunk/openfold/triangular_multiplicative_update.py b/tests/test_autochunk/openfold/triangular_multiplicative_update.py similarity index 97% rename from autochunk/openfold/triangular_multiplicative_update.py rename to tests/test_autochunk/openfold/triangular_multiplicative_update.py index 
2406e2bac2cf..29f7062c3212 100644 --- a/autochunk/openfold/triangular_multiplicative_update.py +++ b/tests/test_autochunk/openfold/triangular_multiplicative_update.py @@ -19,8 +19,8 @@ import torch import torch.nn as nn -from openfold.primitives import Linear, LayerNorm -from openfold.tensor_utils import permute_final_dims +from .primitives import Linear, LayerNorm +from .tensor_utils import permute_final_dims class TriangleMultiplicativeUpdate(nn.Module): From 06a5355d98c0069e3305679a04846637917078e9 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Fri, 6 Jan 2023 11:44:01 +0800 Subject: [PATCH 084/209] update test --- tests/test_autochunk/autochunk_test.py | 111 ++++++++++++------------- 1 file changed, 52 insertions(+), 59 deletions(-) diff --git a/tests/test_autochunk/autochunk_test.py b/tests/test_autochunk/autochunk_test.py index 5e9aaca15f9f..caa2d9a80254 100644 --- a/tests/test_autochunk/autochunk_test.py +++ b/tests/test_autochunk/autochunk_test.py @@ -1,76 +1,60 @@ -import copy -import torch -import torch.nn.functional as F import pytest +import torch import torch.fx import torch.multiprocessing as mp -from torch.fx import GraphModule -from colossalai.fx import ColoTracer + import colossalai -from colossalai.utils import free_port +from colossalai.autochunk.chunk_codegen import ChunkCodeGen from colossalai.core import global_context as gpc +from colossalai.fx import ColoTracer from colossalai.fx.graph_module import ColoGraphModule -from colossalai.fx.passes.meta_info_prop import MetaInfoProp, TensorMetadata +from colossalai.fx.passes.meta_info_prop import MetaInfoProp from colossalai.fx.profiler import MetaTensor +from colossalai.utils import free_port from tests.test_autochunk.evoformer.evoformer import evoformer_base -from ...colossalai.autochunk.chunk_codegen import ChunkCodeGen -with_codegen = True - - -def _is_all_gradient_close(m: torch.nn.Module, gm: GraphModule) -> bool: - for m_p, gm_p in zip(m.parameters(), gm.parameters()): - if m_p.grad is not None 
and not torch.allclose(m_p.grad, gm_p.grad): - return False - return True - - -def _is_all_param_close(m: torch.nn.Module, gm: GraphModule) -> bool: - for m_p, gm_p in zip(m.parameters(), gm.parameters()): - if m_p.grad is not None and not torch.allclose(m_p.data, gm_p.data): - return False - return True - - -def _test_fwd_and_bwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair): - # now_mem = torch.cuda.memory_allocated() / 1024**2 - # with torch.no_grad(): - # node0 = node.clone() - # pair0 = pair.clone() - # model.graph(node0, pair0, now_mem) - # new_now_mem = torch.cuda.memory_allocated() / 1024**2 - # new_max_mem = torch.cuda.max_memory_allocated() / 1024**2 - # print("\ncode now:%.2f max:%.2f" %(new_now_mem - now_mem, new_max_mem - now_mem)) - + + +def _test_fwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair): torch.cuda.reset_peak_memory_stats() now_mem = torch.cuda.memory_allocated() / 1024**2 with torch.no_grad(): node1 = node.clone() pair1 = pair.clone() - gm(node1, pair1) + gm(node1, pair1) new_now_mem = torch.cuda.memory_allocated() / 1024**2 new_max_mem = torch.cuda.max_memory_allocated() / 1024**2 - print("gm now:%.2f max:%.2f" %(new_now_mem - now_mem, new_max_mem - now_mem)) - + print( + "autochunk now mem:%.2f max mem:%.2f" + % (new_now_mem - now_mem, new_max_mem - now_mem) + ) + # test forward with torch.no_grad(): non_fx_out = model(node, pair) fx_out = gm(node, pair) - assert torch.allclose(non_fx_out[0], fx_out[0], atol=1e-4), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean(torch.abs(non_fx_out[0] - fx_out[0])) - assert torch.allclose(non_fx_out[1], fx_out[1], atol=1e-4), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean(torch.abs(non_fx_out[1] - fx_out[1])) - - # test barckward - # loss0 = non_fx_out[0].sum() + non_fx_out[1].sum() - # loss0.backward() - # loss1 = fx_out[0].sum() + fx_out[1].sum() - # loss1.backward() - # assert _is_all_param_close(model, gm) - # assert 
_is_all_gradient_close(model, gm), "gm doesn't have the same gradient as original one" + assert torch.allclose( + non_fx_out[0], fx_out[0], atol=1e-4 + ), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean( + torch.abs(non_fx_out[0] - fx_out[0]) + ) + assert torch.allclose( + non_fx_out[1], fx_out[1], atol=1e-4 + ), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean( + torch.abs(non_fx_out[1] - fx_out[1]) + ) def _run_offload_codegen(rank): # launch colossalai to make sure we could execute colossalai.utils.checkpoint currectly - colossalai.launch(config={}, rank=rank, world_size=1, host='localhost', port=free_port(), backend='nccl') + colossalai.launch( + config={}, + rank=rank, + world_size=1, + host="localhost", + port=free_port(), + backend="nccl", + ) # build model and input model = evoformer_base().cuda() @@ -78,15 +62,25 @@ def _run_offload_codegen(rank): pair = torch.randn(1, 300, 300, 128).cuda() # trace the module and replace codegen - graph = ColoTracer().trace(model, meta_args={'node': node.to(torch.device('meta')), 'pair': pair.to(torch.device('meta'))}) - gm_prop = torch.fx.symbolic_trace(model) # must use symbolic_trace - interp = MetaInfoProp(gm_prop) - interp.propagate(MetaTensor(node, fake_device='cuda:0'), MetaTensor(pair, fake_device='cuda:0')) + graph = ColoTracer().trace( + model, + meta_args={ + "node": node.to(torch.device("meta")), + "pair": pair.to(torch.device("meta")), + }, + ) + gm_prop = torch.fx.symbolic_trace(model) # must use symbolic_trace + interp = MetaInfoProp(gm_prop) + interp.propagate( + MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, fake_device="cuda:0") + ) # now run it twice to get meta info in graph module, not necessary gm = torch.fx.GraphModule(model, graph) interp = MetaInfoProp(gm) - interp.propagate(MetaTensor(node, fake_device='cuda:0'), MetaTensor(pair, fake_device='cuda:0')) + interp.propagate( + MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, 
fake_device="cuda:0") + ) codegen = ChunkCodeGen(gm_prop) graph.set_codegen(codegen) @@ -94,15 +88,14 @@ def _run_offload_codegen(rank): gm.recompile() # assert we have all the components - code = graph.python_code("self").src - print(code) + # code = graph.python_code("self").src + # print(code) - _test_fwd_and_bwd(model, gm, node, pair) + _test_fwd(model, gm, node, pair) gpc.destroy() -@pytest.mark.skipif(not with_codegen, reason='torch version is lower than 1.12.0') -def test_act_ckpt_codegen(): +def test_autochunk(): mp.spawn(_run_offload_codegen, nprocs=1) From d1f07731824c425c26197c7c82425445c8c3df3e Mon Sep 17 00:00:00 2001 From: oahzxl Date: Fri, 6 Jan 2023 11:48:33 +0800 Subject: [PATCH 085/209] rename --- .../{autochunk_benchmark.py => benchmark_autochunk.py} | 0 tests/test_autochunk/{autochunk_test.py => test_autochunk.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename tests/test_autochunk/{autochunk_benchmark.py => benchmark_autochunk.py} (100%) rename tests/test_autochunk/{autochunk_test.py => test_autochunk.py} (100%) diff --git a/tests/test_autochunk/autochunk_benchmark.py b/tests/test_autochunk/benchmark_autochunk.py similarity index 100% rename from tests/test_autochunk/autochunk_benchmark.py rename to tests/test_autochunk/benchmark_autochunk.py diff --git a/tests/test_autochunk/autochunk_test.py b/tests/test_autochunk/test_autochunk.py similarity index 100% rename from tests/test_autochunk/autochunk_test.py rename to tests/test_autochunk/test_autochunk.py From 1a6d2a740be33d769111ed03104bb5fa73b2ad50 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Fri, 6 Jan 2023 14:14:45 +0800 Subject: [PATCH 086/209] take apart chunk code gen --- colossalai/autochunk/autochunk_codegen.py | 497 ++++ colossalai/autochunk/chunk_codegen.py | 2364 ------------------- colossalai/autochunk/chunk_region_search.py | 211 ++ colossalai/autochunk/chunk_selector.py | 221 ++ colossalai/autochunk/index_tracer.py | 1056 +++++++++ 
colossalai/autochunk/memory_estiamtor.py | 318 +++ colossalai/autochunk/utils.py | 95 + tests/test_autochunk/benchmark_autochunk.py | 12 +- tests/test_autochunk/test_autochunk.py | 4 +- 9 files changed, 2408 insertions(+), 2370 deletions(-) create mode 100644 colossalai/autochunk/autochunk_codegen.py delete mode 100644 colossalai/autochunk/chunk_codegen.py create mode 100644 colossalai/autochunk/chunk_region_search.py create mode 100644 colossalai/autochunk/chunk_selector.py create mode 100644 colossalai/autochunk/index_tracer.py create mode 100644 colossalai/autochunk/memory_estiamtor.py create mode 100644 colossalai/autochunk/utils.py diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py new file mode 100644 index 000000000000..58a8c375136e --- /dev/null +++ b/colossalai/autochunk/autochunk_codegen.py @@ -0,0 +1,497 @@ +from typing import Any, Callable, Dict, Iterable, List, Tuple + +import torch +from torch.fx.graph import ( + CodeGen, + PythonCode, + _custom_builtins, + _CustomBuiltin, + _format_target, + _is_from_torch, + _Namespace, + _origin_type_map, + inplace_methods, + magic_methods, +) +from torch.fx.node import Argument, Node, _get_qualified_name, _type_repr, map_arg + +import colossalai + +from .chunk_region_search import ChunkRegionSearch +from .utils import delete_free_var_from_last_use, find_idx_by_name, get_node_shape + +CODEGEN_AVAILABLE = True +__all__ = ["AutoChunkCodeGen"] + + +def _gen_chunk_slice_dim(chunk_dim, chunk_idx_name, shape): + new_shape = "[" + for idx, i in enumerate(shape): + if idx == chunk_dim: + new_shape += "%s:%s + chunk_size" % (chunk_idx_name, chunk_idx_name) + else: + new_shape += ":" + new_shape += ", " + new_shape = new_shape[:-2] + "]" + return new_shape + + +def _gen_loop_start(chunk_input, chunk_output, chunk_ouput_dim, chunk_size=2): + input_node = chunk_input[0] + out_shape = get_node_shape(chunk_output) + out_str = str(list(out_shape)) + context = ( + "chunk_result = 
torch.empty(%s, dtype=%s.dtype, device=%s.device); chunk_size = %d\nfor chunk_idx in range" + % (out_str, input_node.name, input_node.name, chunk_size) + ) + context += "(0, %d, chunk_size):\n" % (out_shape[chunk_ouput_dim]) + return context + + +def _gen_loop_end( + chunk_inputs, chunk_non_compute_inputs, chunk_outputs, chunk_outputs_dim, node_list +): + chunk_outputs_name = chunk_outputs.name + chunk_outputs_idx = find_idx_by_name(chunk_outputs_name, node_list) + chunk_output_shape = chunk_outputs.meta["tensor_meta"].shape + chunk_slice = _gen_chunk_slice_dim( + chunk_outputs_dim, "chunk_idx", chunk_output_shape + ) + context = " chunk_result%s = %s; %s = None\n" % ( + chunk_slice, + chunk_outputs_name, + chunk_outputs_name, + ) + context += ( + chunk_outputs_name + " = chunk_result; chunk_result = None; chunk_size = None" + ) + + # determine if its the last use for chunk input + for chunk_input in chunk_inputs + chunk_non_compute_inputs: + if all( + [ + find_idx_by_name(user.name, node_list) <= chunk_outputs_idx + for user in chunk_input.users.keys() + ] + ): + context += "; %s = None" % chunk_input.name + + context += "\n" + return context + + +def _replace_name(context, name_from, name_to): + patterns = [(" ", " "), (" ", "."), (" ", ","), ("(", ")"), ("(", ","), (" ", ")")] + for p in patterns: + source = p[0] + name_from + p[1] + target = p[0] + name_to + p[1] + if source in context: + context = context.replace(source, target) + return context + + +def _replace_reshape_size(context, node_name, reshape_size_dict): + if node_name not in reshape_size_dict: + return context + for size_name, size_value in reshape_size_dict[node_name].items(): + context = context.replace(size_name, size_value) + return context + + +def emit_code_with_chunk( + body, + nodes, + emit_node_func, + delete_unused_value_func, + chunk_region_search, + chunk_infos, +): + """Emit code with nested activation checkpoint + When we detect some of the node.activation_checkpoint is a List, we 
will use + this function to emit the activation checkpoint codes. + + Args: + body: forward code + ckpt_func: checkpoint functions code + nodes: graph.nodes + emit_node_func: function to emit node + delete_unused_value_func: function to remove the unused value + """ + node_list = list(nodes) + + chunk_regions = [i["region"] for i in chunk_infos] + chunk_starts = [i[0] for i in chunk_regions] + chunk_ends = [i[1] for i in chunk_regions] + + chunk_inputs = [i["inputs"] for i in chunk_infos] + chunk_inputs_non_chunk = [i["inputs_non_chunk"] for i in chunk_infos] + chunk_inputs_dim = [i["inputs_dim"] for i in chunk_infos] + chunk_inputs_names = [j.name for i in chunk_inputs for j in i] + [ + j.name for i in chunk_inputs_non_chunk for j in i + ] + + chunk_outputs = [i["outputs"][0] for i in chunk_infos] + chunk_outputs_dim = [i["outputs_dim"] for i in chunk_infos] + + node_list = chunk_region_search.index_tracer.reorder_node_list(node_list) + node_idx = 0 + region_idx = 0 + within_chunk_region = False + + while node_idx < len(node_list): + node = node_list[node_idx] + + if node_idx in chunk_starts: + within_chunk_region = True + region_idx = chunk_starts.index(node_idx) + body.append( + _gen_loop_start( + chunk_inputs[region_idx], + chunk_outputs[region_idx], + chunk_outputs_dim[region_idx], + chunk_infos[region_idx]["chunk_size"], + ) + ) + + if within_chunk_region: + emit_node_func(node, body) + # replace input var with chunk var + for input_node_idx, input_node in enumerate(chunk_inputs[region_idx]): + for idx, dim in chunk_inputs_dim[region_idx][input_node_idx].items(): + if idx == node_idx: + chunk_slice = _gen_chunk_slice_dim( + dim[0], "chunk_idx", get_node_shape(input_node) + ) + body[-1] = _replace_name( + body[-1], input_node.name, input_node.name + chunk_slice + ) + # ones like + if "ones_like" in node.name: + meta_node = chunk_region_search.index_tracer.node_list[node_idx] + chunk_dim = chunk_infos[region_idx]["node_chunk_dim"][meta_node][ + "chunk_dim" + ] 
+ if get_node_shape(meta_node)[chunk_dim] != 1: + source_node = meta_node.args[0].args[0] + if ( + source_node not in chunk_infos[region_idx]["node_chunk_dim"] + or chunk_infos[region_idx]["node_chunk_dim"][source_node][ + "chunk_dim" + ] + is None + ): + chunk_slice = _gen_chunk_slice_dim( + chunk_dim, "chunk_idx", get_node_shape(node) + ) + body[-1] = _replace_name( + body[-1], node.args[0].name, node.args[0].name + chunk_slice + ) + body[-1] = _replace_reshape_size( + body[-1], node.name, chunk_infos[region_idx]["reshape_size"] + ) + body[-1] = " " + body[-1] + delete_unused_value_func(node, body, chunk_inputs_names) + else: + emit_node_func(node, body) + if node_idx not in chunk_inputs: + delete_unused_value_func(node, body, chunk_inputs_names) + + if node_idx in chunk_ends: + body.append( + _gen_loop_end( + chunk_inputs[region_idx], + chunk_inputs_non_chunk[region_idx], + chunk_outputs[region_idx], + chunk_outputs_dim[region_idx], + node_list, + ) + ) + within_chunk_region = False + + node_idx += 1 + + +if CODEGEN_AVAILABLE: + + class AutoChunkCodeGen(CodeGen): + def __init__(self, meta_graph, max_memory=None): + super().__init__() + self.meta_graph = meta_graph + self.max_memory = max_memory + self.meta_node = list(meta_graph.graph.nodes) + # find the chunk regions + self.chunk_region_search = ChunkRegionSearch(meta_graph, max_memory) + self.chunk_infos = self.chunk_region_search.search_region() + + def _gen_python_code( + self, nodes, root_module: str, namespace: _Namespace + ) -> PythonCode: + free_vars: List[str] = [] + body: List[str] = [] + globals_: Dict[str, Any] = {} + wrapped_fns: Dict[str, None] = {} + + # Wrap string in list to pass by reference + maybe_return_annotation: List[str] = [""] + + def add_global(name_hint: str, obj: Any): + """Add an obj to be tracked as a global. + + We call this for names that reference objects external to the + Graph, like functions or types. 
+ + Returns: the global name that should be used to reference 'obj' in generated source. + """ + if ( + _is_from_torch(obj) and obj != torch.device + ): # to support registering torch.device + # HACK: workaround for how torch custom ops are registered. We + # can't import them like normal modules so they must retain their + # fully qualified name. + return _get_qualified_name(obj) + + # normalize the name hint to get a proper identifier + global_name = namespace.create_name(name_hint, obj) + + if global_name in globals_: + assert globals_[global_name] is obj + return global_name + globals_[global_name] = obj + return global_name + + # set _custom_builtins here so that we needn't import colossalai in forward + _custom_builtins["colossalai"] = _CustomBuiltin( + "import colossalai", colossalai + ) + + # Pre-fill the globals table with registered builtins. + for name, (_, obj) in _custom_builtins.items(): + add_global(name, obj) + + def type_repr(o: Any): + if o == (): + # Empty tuple is used for empty tuple type annotation Tuple[()] + return "()" + + typename = _type_repr(o) + + if hasattr(o, "__origin__"): + # This is a generic type, e.g. typing.List[torch.Tensor] + origin_type = _origin_type_map.get(o.__origin__, o.__origin__) + origin_typename = add_global(_type_repr(origin_type), origin_type) + + if hasattr(o, "__args__"): + # Assign global names for each of the inner type variables. 
+ args = [type_repr(arg) for arg in o.__args__] + + if len(args) == 0: + # Bare type, such as `typing.Tuple` with no subscript + # This code-path used in Python < 3.9 + return origin_typename + + return f'{origin_typename}[{",".join(args)}]' + else: + # Bare type, such as `typing.Tuple` with no subscript + # This code-path used in Python 3.9+ + return origin_typename + + # Common case: this is a regular module name like 'foo.bar.baz' + return add_global(typename, o) + + def _format_args( + args: Tuple[Argument, ...], kwargs: Dict[str, Argument] + ) -> str: + def _get_repr(arg): + # Handle NamedTuples (if it has `_fields`) via add_global. + if isinstance(arg, tuple) and hasattr(arg, "_fields"): + qualified_name = _get_qualified_name(type(arg)) + global_name = add_global(qualified_name, type(arg)) + return f"{global_name}{repr(tuple(arg))}" + return repr(arg) + + args_s = ", ".join(_get_repr(a) for a in args) + kwargs_s = ", ".join(f"{k} = {_get_repr(v)}" for k, v in kwargs.items()) + if args_s and kwargs_s: + return f"{args_s}, {kwargs_s}" + return args_s or kwargs_s + + # Run through reverse nodes and record the first instance of a use + # of a given node. This represents the *last* use of the node in the + # execution order of the program, which we will use to free unused + # values + node_to_last_use: Dict[Node, Node] = {} + user_to_last_uses: Dict[Node, List[Node]] = {} + + def register_last_uses(n: Node, user: Node): + if n not in node_to_last_use: + node_to_last_use[n] = user + user_to_last_uses.setdefault(user, []).append(n) + + for node in reversed(nodes): + map_arg(node.args, lambda n: register_last_uses(n, node)) + map_arg(node.kwargs, lambda n: register_last_uses(n, node)) + + delete_free_var_from_last_use(user_to_last_uses) + + # NOTE: we add a variable to distinguish body and ckpt_func + def delete_unused_values(user: Node, body, to_keep=[]): + """ + Delete values after their last use. 
This ensures that values that are + not used in the remainder of the code are freed and the memory usage + of the code is optimal. + """ + if user.op == "placeholder": + return + if user.op == "output": + body.append("\n") + return + nodes_to_delete = user_to_last_uses.get(user, []) + nodes_to_delete = [i for i in nodes_to_delete if i.name not in to_keep] + if len(nodes_to_delete): + to_delete_str = " = ".join( + [repr(n) for n in nodes_to_delete] + ["None"] + ) + body.append(f"; {to_delete_str}\n") + else: + body.append("\n") + + # NOTE: we add a variable to distinguish body and ckpt_func + def emit_node(node: Node, body): + maybe_type_annotation = ( + "" if node.type is None else f" : {type_repr(node.type)}" + ) + if node.op == "placeholder": + assert isinstance(node.target, str) + maybe_default_arg = ( + "" if not node.args else f" = {repr(node.args[0])}" + ) + free_vars.append( + f"{node.target}{maybe_type_annotation}{maybe_default_arg}" + ) + raw_name = node.target.replace("*", "") + if raw_name != repr(node): + body.append(f"{repr(node)} = {raw_name}\n") + return + elif node.op == "call_method": + assert isinstance(node.target, str) + body.append( + f"{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.target)}" + f"({_format_args(node.args[1:], node.kwargs)})" + ) + return + elif node.op == "call_function": + assert callable(node.target) + # pretty print operators + if ( + node.target.__module__ == "_operator" + and node.target.__name__ in magic_methods + ): + assert isinstance(node.args, tuple) + body.append( + f"{repr(node)}{maybe_type_annotation} = " + f"{magic_methods[node.target.__name__].format(*(repr(a) for a in node.args))}" + ) + return + + # pretty print inplace operators; required for jit.script to work properly + # not currently supported in normal FX graphs, but generated by torchdynamo + if ( + node.target.__module__ == "_operator" + and node.target.__name__ in inplace_methods + ): + body.append( + 
f"{inplace_methods[node.target.__name__].format(*(repr(a) for a in node.args))}; " + f"{repr(node)}{maybe_type_annotation} = {repr(node.args[0])}" + ) + return + + qualified_name = _get_qualified_name(node.target) + global_name = add_global(qualified_name, node.target) + # special case for getattr: node.args could be 2-argument or 3-argument + # 2-argument: attribute access; 3-argument: fall through to attrib function call with default value + if ( + global_name == "getattr" + and isinstance(node.args, tuple) + and isinstance(node.args[1], str) + and node.args[1].isidentifier() + and len(node.args) == 2 + ): + body.append( + f"{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.args[1])}" + ) + return + body.append( + f"{repr(node)}{maybe_type_annotation} = {global_name}({_format_args(node.args, node.kwargs)})" + ) + if node.meta.get("is_wrapped", False): + wrapped_fns.setdefault(global_name) + return + elif node.op == "call_module": + assert isinstance(node.target, str) + body.append( + f"{repr(node)}{maybe_type_annotation} = " + f"{_format_target(root_module, node.target)}({_format_args(node.args, node.kwargs)})" + ) + return + elif node.op == "get_attr": + assert isinstance(node.target, str) + body.append( + f"{repr(node)}{maybe_type_annotation} = {_format_target(root_module, node.target)}" + ) + return + elif node.op == "output": + if node.type is not None: + maybe_return_annotation[0] = f" -> {type_repr(node.type)}" + body.append(self.generate_output(node.args[0])) + return + raise NotImplementedError(f"node: {node.op} {node.target}") + + # Modified for activation checkpointing + ckpt_func = [] + + # if any node has a list of labels for activation_checkpoint, we + # will use nested type of activation checkpoint codegen + emit_code_with_chunk( + body, + nodes, + emit_node, + delete_unused_values, + self.chunk_region_search, + self.chunk_infos, + ) + + if len(body) == 0: + # If the Graph has no non-placeholder nodes, no lines for the 
body + # have been emitted. To continue to have valid Python code, emit a + # single pass statement + body.append("pass\n") + + if len(wrapped_fns) > 0: + wrap_name = add_global("wrap", torch.fx.wrap) + wrap_stmts = "\n".join( + [f'{wrap_name}("{name}")' for name in wrapped_fns] + ) + else: + wrap_stmts = "" + + if self._body_transformer: + body = self._body_transformer(body) + + for name, value in self.additional_globals(): + add_global(name, value) + + # as we need colossalai.utils.checkpoint, we need to import colossalai + # in forward function + prologue = self.gen_fn_def(free_vars, maybe_return_annotation[0]) + prologue = "".join(ckpt_func) + prologue + prologue = prologue + + code = "".join(body) + code = "\n".join(" " + line for line in code.split("\n")) + fn_code = f""" +{wrap_stmts} + +{prologue} +{code}""" + # print(fn_code) + return PythonCode(fn_code, globals_) diff --git a/colossalai/autochunk/chunk_codegen.py b/colossalai/autochunk/chunk_codegen.py deleted file mode 100644 index 7a5d06689247..000000000000 --- a/colossalai/autochunk/chunk_codegen.py +++ /dev/null @@ -1,2364 +0,0 @@ -import colossalai -import torch -import copy -from typing import List, Callable, Any, Tuple, Dict, Iterable - -from torch.fx.node import Node, Argument, map_arg, _type_repr, _get_qualified_name -from torch.fx.graph import ( - _Namespace, - PythonCode, - _custom_builtins, - _is_from_torch, - _format_target, - magic_methods, - CodeGen, - _origin_type_map, - inplace_methods, - _CustomBuiltin, -) -from colossalai.fx.profiler import ( - calculate_fwd_out, - calculate_fwd_tmp, - parameter_size, - activation_size, -) - -CODEGEN_AVAILABLE = True -__all__ = ["ChunkCodeGen"] - - -def _delete_free_var_from_last_use(user_to_last_uses): - for key, value in user_to_last_uses.items(): - for n in value: - if n.op == "placeholder": - user_to_last_uses[key].remove(n) - - -def _get_node_shape(node): - if hasattr(node.meta["tensor_meta"], "shape"): - return node.meta["tensor_meta"].shape - 
return None - - -def _is_non_compute_node(node): - if any(i in node.op for i in ["placeholder", "get_attr", "output"]) or any( - i in node.name for i in ["getitem", "getattr"] - ): - return True - return False - - -def _is_non_compute_node_except_placeholder(node): - if any(i in node.op for i in ["get_attr", "output"]) or any( - i in node.name for i in ["getitem", "getattr"] - ): - return True - return False - - -def _is_non_compute_node_except_placeholder_output(node): - if any(i in node.op for i in ["get_attr"]) or any( - i in node.name for i in ["getitem", "getattr"] - ): - return True - return False - - -class IndexTracer(object): - def __init__(self, node_list) -> None: - self.node_list = node_list - self.idx_trace_list = self._init_idx_trace_list() - self.idx_trace_equal = [] - self.idx_view_list = {} - self.idx_count = -1 - self.all_reorder_map = {i: i for i in range(len(self.idx_trace_list))} - - def _init_idx_trace_list(self): - idx_trace_list = [] - for n in self.node_list: - if _get_node_shape(n) != None: - cur_trace = { - "idx": [None for _ in range(len(_get_node_shape(n)))], - "compute": [[] for _ in range(len(_get_node_shape(n)))], - "source": [{} for _ in range(len(_get_node_shape(n)))], - } - else: - cur_trace = {"idx": [], "compute": [], "source": []} - idx_trace_list.append(cur_trace) - return idx_trace_list - - def _add_index(self): - """ - Update the count and return it. To record the idx number. 
- - Returns: - idx_count: int - """ - self.idx_count += 1 - return self.idx_count - - def _del_dim(self, idx, dim_idx): - self.idx_trace_list[idx]["idx"].pop(dim_idx) - self.idx_trace_list[idx]["compute"].pop(dim_idx) - self.idx_trace_list[idx]["source"].pop(dim_idx) - - def _add_dim(self, node_idx, dim_idx): - self.idx_trace_list[node_idx]["idx"].insert(dim_idx, self._add_index()) - self.idx_trace_list[node_idx]["compute"].insert(dim_idx, []) - self.idx_trace_list[node_idx]["source"].insert(dim_idx, {}) - - def _transform_index(self, node, node_dim): - node_idx = self._find_idx_trace_from_node(node) - dims = list(range(len(node_idx))) - return dims[node_dim] - - def _inherit_index(self, node_from, node_from_dim, node_to, node_to_dim): - node_from_dim = self._transform_index(node_from, node_from_dim) - node_to_dim = self._transform_index(node_to, node_to_dim) - node_from_trace = self._find_trace_from_node(node_from) - node_to_trace = self._find_trace_from_node(node_to) - node_to_trace["idx"][node_to_dim] = node_from_trace["idx"][node_from_dim] - node_to_trace["compute"][node_to_dim] = copy.deepcopy( - node_from_trace["compute"][node_from_dim] - ) - self._add_source(node_from, node_from_dim, node_to, node_to_dim, init=True) - - def _inherit_all_computation(self, node_from, node_to): - node_from_compute = self._find_compute_trace_from_node(node_from) - node_to_compute = self._find_compute_trace_from_node(node_to) - assert len(node_from_compute) == len(node_to_compute) - for i in range(len(node_from_compute)): - self._add_source(node_from, i, node_to, i) - node_to_compute[i] = copy.deepcopy(node_from_compute[i]) - - def _add_source(self, node_from, node_from_dim, node_to, node_to_dim, init=False): - node_from_dim = self._transform_index(node_from, node_from_dim) - node_from_trace_source = self._find_source_trace_from_node(node_from) - node_to_dim = self._transform_index(node_to, node_to_dim) - node_to_trace_source = self._find_source_trace_from_node(node_to) - 
node_from_idx = _find_idx_by_name(node_from.name, self.node_list) - if init: - node_to_trace_source[node_to_dim] = {} - # add dim to cur new source - if node_from_idx not in node_to_trace_source[node_to_dim]: - node_to_trace_source[node_to_dim][node_from_idx] = [node_from_dim] - else: - if node_from_dim not in node_to_trace_source[node_to_dim][node_from_idx]: - node_to_trace_source[node_to_dim][node_from_idx].append(node_from_dim) - # update inputs source - for node_idx, node_dim in node_from_trace_source[node_from_dim].items(): - if node_idx not in node_to_trace_source[node_to_dim]: - node_to_trace_source[node_to_dim][node_idx] = copy.deepcopy(node_dim) - else: - for d in node_dim: - if d not in node_to_trace_source[node_to_dim][node_idx]: - node_to_trace_source[node_to_dim][node_idx].append(d) - - def _mark_computation_from_node(self, node_from, node_to, exclude=None): - if exclude == None: - exclude = [] - else: - exclude = [self._transform_index(node_to, i) for i in exclude] - node_from_compute = self._find_compute_trace_from_node(node_from) - node_to_compute = self._find_compute_trace_from_node(node_to) - # assert len(node_from_compute) == len(node_to_compute) - for i in range(-1, -min(len(node_from_compute), len(node_to_compute)) - 1, -1): - if self._transform_index(node_to, i) in exclude: - continue - self._add_source(node_from, i, node_to, i) - for j in node_from_compute[i]: - if j not in node_to_compute[i]: - node_to_compute[i].append(j) - - def _mark_idx_equal(self, node1, dim1, node2, dim2): - """ - Mark 2 index to be equal. - - Args: - idx1 (int): index count. - idx2 (int): index count. - """ - # node1_idx = _find_idx_by_name(node1.name, self.nodes_list) - # node2_idx = _find_idx_by_name(node2.name, self.nodes_list) - # if node1_idx > node2_idx: - # self._add_source(node2, dim2, node1, dim1) - # else: - # self._add_source(node1, dim1, node2, dim2) - - def _mark_computation(self, node, idx, dim): - """ - Mark some dims of node as computed. 
- - Args: - node (node) - idx (int): node index - dim (list or int): dims to be marked as computed - """ - if isinstance(dim, int): - dim = [dim] - dims = list(range(len(_get_node_shape(node)))) - for d in dim: - cur_dim = dims[d] - if idx not in self.idx_trace_list[idx]["compute"][cur_dim]: - self.idx_trace_list[idx]["compute"][cur_dim].append(idx) - - def _find_trace_from_node(self, node): - """ - Find node idx and compute trace by the node. - - Args: - node (node) - Returns: - idx (list): idx of the node - compute (list): computed idx of the node. - """ - node_idx = _find_idx_by_name(node.name, self.node_list) - node_dict = self.idx_trace_list[node_idx] - return node_dict - - def _find_source_trace_from_node(self, node): - """ - Find node source trace by the node. - - Args: - node (node) - Returns: - idx (list): idx of the node - compute (list): computed idx of the node. - """ - node_idx = _find_idx_by_name(node.name, self.node_list) - node_dict = self.idx_trace_list[node_idx] - return node_dict["source"] - - def _find_idx_trace_from_node(self, node): - """ - Find node idx trace by the node. - - Args: - node (node) - Returns: - idx (list): idx of the node - """ - node_idx = _find_idx_by_name(node.name, self.node_list) - return self.idx_trace_list[node_idx]["idx"] - - def _find_compute_trace_from_node(self, node): - """ - Find node compute trace by the node. - - Args: - node (node) - Returns: - compute (list): computed idx of the node. - """ - node_idx = _find_idx_by_name(node.name, self.node_list) - return self.idx_trace_list[node_idx]["compute"] - - def _assign_index_as_input(self, node, node_idx, input_node=None): - """ - Assign node's trace as its input node. 
- - Args: - node (node) - node_idx (int) - """ - if input_node == None: - input_node = node.args[0] - input_node_idx = _find_idx_by_name(input_node.name, self.node_list) - input_node_idx_trace = self.idx_trace_list[input_node_idx]["idx"] - - new_idx_trace = copy.deepcopy(input_node_idx_trace) - self.idx_trace_list[node_idx]["idx"] = new_idx_trace - - self._inherit_all_computation(input_node, node) - - def _assign_all_index(self, node, node_idx): - """ - Add new index for all node's dims. - - Args: - node (node) - node_idx (int) - """ - shape = node.meta["tensor_meta"].shape - new_trace = [] - for _ in shape: - new_trace.append(self._add_index()) - self.idx_trace_list[node_idx]["idx"] = new_trace - - def _assign_transpose_index(self, node, node_idx): - """ - Assign index for transpose op. - 1. swap input's dim according to transpose args - 2. inherit input's computation - - Args: - node (node) - node_idx (int) - """ - input_node = node.args[0] - tranpose_dim = node.args[1:] - - self._assign_index_as_input(node, node_idx, input_node) - self._inherit_index(input_node, tranpose_dim[1], node, tranpose_dim[0]) - self._inherit_index(input_node, tranpose_dim[0], node, tranpose_dim[1]) - - def _assign_permute_index(self, node, node_idx): - """ - Assign index for permute op. - 1. swap input's dim according to permute args - 2. inherit input's computation - - Args: - node (node) - node_idx (int) - """ - permute_dim = node.args[1:] - input_node = node.args[0] - - self._assign_index_as_input(node, node_idx, input_node) - for idx, d in enumerate(permute_dim): - self._inherit_index(input_node, d, node, idx) - - def _assign_linear_index(self, node, node_idx): - """ - Assign index for linear op. - 1. copy trace from input node and change last index accroding to weight - 2. mark equal for input node last index, weight first dim and bias dim. - 3. inherit input's computation, mark computation for last dim. 
- - Args: - node (node) - node_idx (int) - """ - if len(node.args) == 2: - input_node, weight = node.args - bias = None - else: - input_node, weight, bias = node.args - - self._assign_index_as_input(node, node_idx) - self._inherit_index(weight, 1, node, -1) - - self._mark_computation(node, node_idx, [-1]) - self._mark_idx_equal(input_node, -1, weight, 0) - - if bias: - self._mark_idx_equal(input_node, -1, bias, 0) - - def _assign_matmul_index(self, node, node_idx): - """ - Assign index for matmul op. - 1. copy trace from matmul_left and change last index accroding to matmul_right. (assert they have same length) - 2. mark equal for input matmul_left -1 index and matmul_right -2 dim. - 3. inherit matmul_left and matmul_right computation, mark computation for last dim. - - Args: - node (node) - node_idx (int) - """ - matmul_left, matmul_right = node.args - - assert len(_get_node_shape(matmul_left)) == len(_get_node_shape(matmul_right)) - self._assign_index_as_input(node, node_idx, matmul_left) - self._inherit_index(matmul_right, -1, node, -1) - - self._mark_computation_from_node(matmul_right, node, [-1, -2]) - self._mark_computation(node, node_idx, [-1]) - self._mark_idx_equal(matmul_left, -1, matmul_right, -2) - - def _assign_layernorm_index(self, node, idx): - """ - Assign index for layernorm op. - 1. assign index as input node - 2. inherit computation and mark last 2 dims as computed. - - Args: - node (node) - node_idx (int) - """ - self._assign_index_as_input(node, idx) - self._mark_computation(node, idx, [-1]) - - def _assign_elementwise_index(self, node, idx): - """ - Assign index for element-wise op (eg. relu sigmoid add mul). - 1. assign index as input node - 2. inherit computation from all input nodes. 
- - Args: - node (node) - node_idx (int) - """ - self._assign_index_as_input(node, idx) - nodes_in = [] - for node_in in node.args: - if type(node_in) == type(node): - nodes_in.append(node_in) - self._mark_computation_from_node(node_in, node) - assert len(nodes_in) <= 2 - if len(nodes_in) == 2: - node_in0_shape = _get_node_shape(nodes_in[0]) - node_in1_shape = _get_node_shape(nodes_in[1]) - for i in range(-1, -min(len(node_in0_shape), len(node_in1_shape)) - 1, -1): - if node_in0_shape[i] == node_in1_shape[i]: - self._mark_idx_equal(nodes_in[0], i, nodes_in[1], i) - - def _assgin_no_change_index(self, node, idx): - self._assign_index_as_input(node, idx) - for node_in in node.args: - if type(node_in) == type(node): - self._mark_computation_from_node(node_in, node) - - def _assign_einsum_index(self, node, idx): - """ - Assign index for einsum op. - - Args: - node (node) - node_idx (int) - """ - patterns = node.args[0] - input_nodes = node.args[1:] - - patterns = patterns.replace(" ", "") - left, right = patterns.split("->") - left = left.split(",") - - all_index = [] - for i in left: - for c in i: - all_index.append(c) - all_index = set(all_index) - free_index = set([i for i in right]) - sum_index = all_index - free_index - - for right_idx, right_indice in enumerate(right): - for left_idx, left_str in enumerate(left): - if right_indice in left_str: - source_idx = left_str.index(right_indice) - self._inherit_index( - input_nodes[left_idx], source_idx, node, right_idx - ) - - # for i in sum_index: - # for left_idx, left_str in enumerate(left): - # if i in left_str: - # self._mark_computation(node, idx, left_str.index(i)) - # break - - def _assign_softmax_index(self, node, idx): - """ - Assign index for softmax op. - 1. assign index as input node - 2. inherit computation and mark softmax dim as computed. 
- - Args: - node (node) - node_idx (int) - """ - self._assign_index_as_input(node, idx) - self._mark_computation(node, idx, [node.kwargs["dim"]]) - - def _assign_unsqueeze_index(self, node, node_idx): - """ - Assign index for unsqueeze op. - 1. assign new index for unsqueeze dim - - Args: - node (node) - node_idx (int) - """ - self._del_dim(node_idx, -1) - self._assign_index_as_input(node, node_idx) - self._add_dim(node_idx, node.args[1]) - - def _assign_dropout_index(self, node, node_idx): - """ - Assign index for unsqueeze op. - 1. assign new index for unsqueeze dim - - Args: - node (node) - node_idx (int) - """ - self._assign_index_as_input(node, node_idx) - - def _assign_ones_like_index(self, node, node_idx): - """ - Assign index for oneslike op. - 1. assign new index for all dim - - Args: - node (node) - node_idx (int) - """ - self._assign_all_index(node, node_idx) - - def _assign_view_reshape_index(self, node, node_idx): - """ - Assign index for view and reshape op. - 1. get origin shape and target shape by meta info. - 2. compute the real value of -1 in target shape. - 3. determine changed dim, and assgin index for generated dim. - 4. log changed dim and generated dim for restore - 5. inherit computation. - 6. TODO: look into view list to see whether the view is associated with other, - if so assgin equal dim according to previous view. 
- - Args: - node (node) - node_idx (int) - """ - # get data, turn into number - origin_node = node.args[0] - origin_shape = origin_node.meta["tensor_meta"].shape - target_shape = [] - for i in range(1, len(node.args)): - if isinstance(node.args[i], int): - target_shape.append(node.args[i]) - else: - target_shape.append(node.args[i].meta["fwd_out"][0]) - - # compute the value of -1 - if -1 in target_shape: - origin_product = 1 - for i in origin_shape: - origin_product *= i - target_product = -1 - for i in target_shape: - target_product *= i - shape_idx = target_shape.index(-1) - target_shape[shape_idx] = origin_product // target_product - - # determine changed dim - len_diff = len(origin_shape) - len(target_shape) - if len_diff == 1: - # dim merge - dim_equal = [i == j for i, j in zip(origin_shape[:-1], target_shape)] - dim_to = [dim_equal.index(False)] - dim_from = [dim_equal.index(False), dim_equal.index(False) + 1] - self._add_dim(node_idx, -1) - elif len_diff == -1: - # dim expand - dim_equal = [i == j for i, j in zip(origin_shape, target_shape[:-1])] - dim_from = [dim_equal.index(False)] - dim_to = [dim_equal.index(False), dim_equal.index(False) + 1] - self._del_dim(node_idx, -1) - else: - raise NotImplementedError( - "shape" - + str(origin_shape) - + "and" - + str(target_shape) - + "view not implemented" - ) - - # get new index - origin_trace = self._find_idx_trace_from_node(origin_node) - self._assign_index_as_input(node, node_idx, origin_node) - dim_from.reverse() - for i in dim_from: - self._del_dim(node_idx, i) - for i in dim_to: - self._add_dim(node_idx, i) - - # inherit computation - compute_log = self._find_compute_trace_from_node(origin_node) - for i in dim_from: - if origin_trace[i] in compute_log: - for j in dim_to: - self._mark_computation(node, node_idx, [j]) - break - - # log view, not used now - view_dict = { - "idx_from": [origin_trace[i] for i in dim_from], - "dim_from": dim_from, - "idx_to": [self.idx_trace_list[node_idx]["idx"][i] for i in 
dim_to], - "dim_to": dim_to, - } - self.idx_view_list[node] = view_dict - - def _merge_equal_idx(self): - idx_equal = copy.deepcopy(self.idx_trace_equal) - idx_equal.reverse() - for idx in idx_equal: - merge_to = min(idx) - merge_from = max(idx) - for trace in self.idx_trace_list: - if merge_from in trace["idx"]: - trace["idx"] = [ - merge_to if i == merge_from else i for i in trace["idx"] - ] - - def trace_index(self): - for idx, node in enumerate(self.node_list): - if node.op == "placeholder": - self._assign_all_index(node, idx) - elif node.op == "call_method": - if "transpose" in node.name: - self._assign_transpose_index(node, idx) - elif "permute" in node.name: - self._assign_permute_index(node, idx) - elif "view" in node.name or "reshape" in node.name: - self._assign_view_reshape_index(node, idx) - elif "unsqueeze" in node.name: - self._assign_unsqueeze_index(node, idx) - elif any(i in node.name for i in ["to", "contiguous"]): - self._assgin_no_change_index(node, idx) - else: - raise NotImplementedError(node.name, "method not implemented yet!") - elif node.op == "call_function": - if "linear" in node.name: - self._assign_linear_index(node, idx) - elif "matmul" in node.name: - self._assign_matmul_index(node, idx) - elif "softmax" in node.name: - self._assign_softmax_index(node, idx) - elif any(n in node.name for n in ["mul", "add", "sigmoid", "relu"]): - self._assign_elementwise_index(node, idx) - elif "ones_like" in node.name: - self._assign_ones_like_index(node, idx) - elif "dropout" in node.name: - self._assign_dropout_index(node, idx) - elif "einsum" in node.name: - self._assign_einsum_index(node, idx) - elif "getattr" in node.name: - continue # get attr like shape - elif "getitem" in node.name: - continue # get item in list - else: - raise NotImplementedError( - node.name, "function not implemented yet!" 
- ) - elif node.op == "call_module": - if any(n in node.name for n in ["layernorm", "norm"]): - self._assign_layernorm_index(node, idx) - else: - raise NotImplementedError(node.name, "module not implemented yet!") - elif node.op == "get_attr": - self._assign_all_index(node, idx) # get param - elif node.op == "output": - continue - else: - raise NotImplementedError(node.op, "op not implemented yet!") - # self._merge_equal_idx() - - def check_index_source(self, start_dim, start_node, start_idx, end_dim, end_node): - """ - Check 2 given index: one index should be source of the other - Args: - start_idx(int): start node chunk dim - start_node(node): start node - end_idx(int): end node chunk dim - end_node(node): end node - - Returns: - bool: True if check pass - """ - start_node_idx = _find_idx_by_name(start_node.name, self.node_list) - end_node_trace = self._find_trace_from_node(end_node) - end_node_trace_source = end_node_trace["source"][end_dim] - sorted_source = sorted( - end_node_trace_source.items(), key=lambda d: d[0], reverse=True - ) - for node_idx, node_dim in sorted_source: - if node_idx == start_node_idx and start_dim in node_dim: - return True - # it means we meet a node outside the loop, and the node is not input node - if node_idx < start_idx: - return False - return False - - def check_index_compute(self, start_idx, end_dim, end_node, end_idx): - """ - Check 2 given index: check they haven't been computed in the source trace. 
- Args: - start_idx(int): start node chunk dim - start_node(node): start node - end_idx(int): end node chunk dim - end_node(node): end node - - Returns: - bool: True if check pass - """ - end_node_trace = self._find_trace_from_node(end_node) - end_node_compute = end_node_trace["compute"][end_dim] - if any(start_idx <= i <= end_idx for i in end_node_compute): - return False - return True - - def get_node_chunk_dim(self, node_from, node_from_dim, node_to): - node_from_source = self._find_source_trace_from_node(node_from) - dim_source = node_from_source[node_from_dim] - node_to_idx = _find_idx_by_name(node_to.name, self.node_list) - for k, v in dim_source.items(): - if k == node_to_idx: - return v - return None - - def _find_inherit_dim(self, input_node, input_dim, node): - input_node_idx = _find_idx_by_name(input_node.name, self.node_list) - node_trace_source = self._find_source_trace_from_node(node) - for node_dim in range(len(_get_node_shape(node))): - if ( - input_node_idx in node_trace_source[node_dim] - and input_dim[0] in node_trace_source[node_dim][input_node_idx] - ): - return node_dim - return None - - def check_index_duplicate(self, chunk_infos, return_dim=False): - input_dim_after_node = {} - for input_node_idx, input_node in enumerate(chunk_infos["inputs"]): - for k, v in chunk_infos["inputs_dim"][input_node_idx].items(): - inherit_dim = self._find_inherit_dim(input_node, v, self.node_list[k]) - if inherit_dim: - input_dim_after_node[k] = inherit_dim - - for node in self.node_list[ - chunk_infos["region"][0] : chunk_infos["region"][1] + 1 - ]: - if _is_non_compute_node_except_placeholder(node): - continue - count = 0 - duplicate_dims = [] - node_trace_source = self._find_source_trace_from_node(node) - for node_dim in range(len(_get_node_shape(node))): - duplicate_dim = [] - duplicate_flag = False - dim_source = node_trace_source[node_dim] - for k, v in dim_source.items(): - if chunk_infos["region"][0] <= k <= chunk_infos["region"][1]: - if k in 
input_dim_after_node and input_dim_after_node[k] in v: - duplicate_flag = True - duplicate_dim.append((k, v)) - duplicate_dims.append(duplicate_dim) - if duplicate_flag: - count += 1 - - if count > 1: - if return_dim: - return False, duplicate_dims - else: - return False - if return_dim: - return True, None - else: - return True - - def _assgin_single_node_flow( - self, - arg_node, - start_idx, - end_idx, - cur_node_dim, - cur_node_compute, - cur_node_source, - cur_node_fix_dim, - all_node_info, - next_node_list, - ): - arg_idx = _find_idx_by_name(arg_node.name, self.node_list) - # arg in chunk range or be inputs - if not (start_idx <= arg_idx < end_idx): - return True - - # find arg dim - if cur_node_dim is not None: - # dim is computed - if arg_idx in cur_node_compute[cur_node_dim]: - return False - if arg_idx not in cur_node_source[cur_node_dim]: - arg_dim = None - else: - arg_dim = cur_node_source[cur_node_dim][arg_idx][0] - else: - arg_dim = None - - # get fix dim - arg_fix_dim = [] - if cur_node_dim is not None: - for i in cur_node_fix_dim: - fix_dim_source = cur_node_source[i] - if arg_idx in fix_dim_source: - arg_fix_dim.append(fix_dim_source[arg_idx][0]) - - # if already in node_info, arg dim must be same - if arg_node in all_node_info: - if all_node_info[arg_node]["chunk_dim"] != arg_dim: - return False - all_node_info[arg_node]["fix_dim"] = list( - set(all_node_info[arg_node]["fix_dim"] + arg_fix_dim) - ) - # else add it to list - else: - all_node_info[arg_node] = {"chunk_dim": arg_dim, "fix_dim": arg_fix_dim} - - next_node_list.append(arg_node) - return True - - def flow_search(self, start_idx, start_dim, end_idx, end_dim): - inputs, outputs = _find_chunk_compute_input_and_output_nodes( - self.node_list[start_idx : end_idx + 1] - ) - # only single ouput - if len(outputs) > 1: - return None - - cur_node_list = [self.node_list[end_idx]] # start from the last node - all_node_info = {cur_node_list[0]: {"chunk_dim": end_dim, "fix_dim": []}} - - while 
len(cur_node_list) > 0: - next_node_list = [] - - for cur_node in cur_node_list: - # get cur node info - cur_node_chunk_dim = all_node_info[cur_node]["chunk_dim"] - cur_node_fix_dim = all_node_info[cur_node]["fix_dim"] - cur_node_idx = _find_idx_by_name(cur_node.name, self.node_list) - if cur_node_chunk_dim: - cur_node_compute = self._find_compute_trace_from_node(cur_node) - cur_node_source = self._find_source_trace_from_node(cur_node) - else: - cur_node_compute = cur_node_source = None - - # get all valid args - arg_list = [] - for arg in cur_node.args: - if type(arg) != type(cur_node): - continue - if _is_non_compute_node(arg): - continue - arg_list.append(arg) - flow_flag = self._assgin_single_node_flow( - arg, - start_idx, - end_idx, - cur_node_chunk_dim, - cur_node_compute, - cur_node_source, - cur_node_fix_dim, - all_node_info, - next_node_list, - ) - if flow_flag == False: - return None - - if len(arg_list) == 2: - if any(i in cur_node.name for i in ["add", "mul"]): - for arg in arg_list: - if not ( - start_idx - <= _find_idx_by_name(arg.name, self.node_list) - < end_idx - ): - continue - arg_chunk_dim = all_node_info[arg]["chunk_dim"] - arg_fix_dim = all_node_info[arg]["fix_dim"] - arg_shape = _get_node_shape(arg) - # add all dim as fix dim except chunk dim - for i, shape in enumerate(arg_shape): - if shape != 1 and i != cur_node_chunk_dim: - if i == arg_chunk_dim: - return None - if i not in arg_fix_dim: - arg_fix_dim.append(i) - elif "einsum" in cur_node.name: - pass - elif "matmul" in cur_node.name: - pass - else: - raise NotImplementedError() - cur_node_list = next_node_list - - inputs_dim = [] - remove_inputs = [] - for input_node in inputs: - input_dict = {} - input_node_idx = _find_idx_by_name(input_node.name, self.node_list) - for user in input_node.users.keys(): - if _is_non_compute_node(user): - continue - user_idx = _find_idx_by_name(user.name, self.node_list) - if start_idx <= user_idx <= end_idx: - chunk_dim = all_node_info[user]["chunk_dim"] - 
if chunk_dim is not None: - user_source = self._find_source_trace_from_node(user)[chunk_dim] - if input_node_idx in user_source: - input_dict[user_idx] = user_source[input_node_idx] - else: - return None - if len(input_dict) == 0: - remove_inputs.append(input_node) - else: - inputs_dim.append(input_dict) - for i in remove_inputs: - if i in inputs: - inputs.remove(i) - - chunk_info = { - "region": (start_idx, end_idx), - "inputs": inputs, - "inputs_non_chunk": [], - "inputs_dim": inputs_dim, - "outputs": outputs, - "outputs_dim": end_dim, - "node_chunk_dim": all_node_info, - "args": {}, - } - - # move useless nodes ahead of loop - # get all possible prepose nodes - maybe_prepose_nodes = [] - for node, node_info in all_node_info.items(): - if node_info["chunk_dim"] is None: - maybe_prepose_nodes.append(node) - maybe_prepose_nodes.sort( - key=lambda x: _find_idx_by_name(x.name, self.node_list), - reverse=True, - ) # from last node to first node - prepose_nodes = [] - # set every node as root, search its args, if all legal, turn root and args as prepose nodes - while len(maybe_prepose_nodes) > 0: - tmp_cur_prepose_nodes = [maybe_prepose_nodes[0]] - tmp_cur_related_prepose_nodes = [] - prepose_flag = True - - # loop cur node's all arg until out of chunk - while len(tmp_cur_prepose_nodes) > 0: - if prepose_flag == False: - break - tmp_next_prepose_nodes = [] - tmp_cur_related_prepose_nodes.extend(tmp_cur_prepose_nodes) - for cur_prepose_node in tmp_cur_prepose_nodes: - if prepose_flag == False: - break - for cur_prepose_node_arg in cur_prepose_node.args: - if type(cur_prepose_node_arg) != type(cur_prepose_node): - continue - # out of loop - if not ( - start_idx - <= _find_idx_by_name( - cur_prepose_node_arg.name, self.node_list - ) - < end_idx - ): - continue - # compute op in loop - elif cur_prepose_node_arg in all_node_info: - if all_node_info[cur_prepose_node_arg]["chunk_dim"] is None: - tmp_next_prepose_nodes.append(cur_prepose_node_arg) - else: - prepose_flag = 
False - break - # non compute op - else: - tmp_next_prepose_nodes.append(cur_prepose_node_arg) - tmp_cur_prepose_nodes = tmp_next_prepose_nodes - - if prepose_flag == False: - maybe_prepose_nodes.remove(maybe_prepose_nodes[0]) - continue - else: - for n in tmp_cur_related_prepose_nodes: - if n not in prepose_nodes: - prepose_nodes.append(n) - if n in maybe_prepose_nodes: - maybe_prepose_nodes.remove(n) - # sort by index - prepose_nodes.sort(key=lambda x: _find_idx_by_name(x.name, self.node_list)) - chunk_info["args"]["prepose_nodes"] = prepose_nodes - - # we need to log input nodes to avoid deleteing them in the loop - chunk_node_list = self.node_list[start_idx : end_idx + 1] - # also need to get some prepose node's arg out of non_chunk_inputs - for n in prepose_nodes: - chunk_node_list.remove(n) - non_chunk_inputs = _find_chunk_all_input_nodes(chunk_node_list) - for i in non_chunk_inputs: - if i not in chunk_info["inputs"]: - chunk_info["inputs_non_chunk"].append(i) - - # reassgin reshape size, some size may have changed due to chunk - chunk_info = self._reassgin_reshape_size(chunk_info) - - return chunk_info - - def _reassgin_reshape_size(self, chunk_info): - chunk_region = chunk_info["region"] - reshape_size = {} - chunk_shape = _get_node_shape(chunk_info["outputs"][0])[ - chunk_info["outputs_dim"] - ] - for node in self.node_list[chunk_region[0] : chunk_region[1] + 1]: - if any(i in node.name for i in ["reshape", "view"]): - reshape_args = node.args[1:] - reshape_log = self.idx_view_list[node] - chunk_dim = chunk_info["node_chunk_dim"][node]["chunk_dim"] - reshape_size[node.name] = {} - for reshape_arg_dim, reshape_arg in enumerate(reshape_args): - if reshape_arg_dim in reshape_log["dim_to"]: - continue - if reshape_arg_dim == chunk_dim: - reshape_size[node.name][reshape_arg.name] = ( - "min(chunk_size, %d - chunk_idx)" % chunk_shape - ) - chunk_info["reshape_size"] = reshape_size - return chunk_info - - def _get_reorder_map(self, chunk_info): - reorder_map = 
{i: i for i in range(len(self.node_list))} - - chunk_region_start = chunk_info["region"][0] - chunk_region_end = chunk_info["region"][1] - chunk_prepose_nodes = chunk_info["args"]["prepose_nodes"] - chunk_prepose_nodes_idx = [ - _find_idx_by_name(i.name, self.node_list) for i in chunk_prepose_nodes - ] - # put prepose nodes ahead - for idx, n in enumerate(chunk_prepose_nodes): - n_idx = chunk_prepose_nodes_idx[idx] - reorder_map[n_idx] = chunk_region_start + idx - # put other nodes after prepose nodes - for n in self.node_list[chunk_region_start : chunk_region_end + 1]: - if n in chunk_prepose_nodes: - continue - n_idx = _find_idx_by_name(n.name, self.node_list) - pos = sum([n_idx < i for i in chunk_prepose_nodes_idx]) - reorder_map[n_idx] = n_idx + pos - - return reorder_map - - def _reorder_chunk_info(self, chunk_info, reorder_map): - # update chunk info - chunk_info["region"] = ( - chunk_info["region"][0] + len(chunk_info["args"]["prepose_nodes"]), - chunk_info["region"][1], - ) - new_inputs_dim = [] - for idx, input_dim in enumerate(chunk_info["inputs_dim"]): - new_input_dim = {} - for k, v in input_dim.items(): - new_input_dim[reorder_map[k]] = v - new_inputs_dim.append(new_input_dim) - chunk_info["inputs_dim"] = new_inputs_dim - return chunk_info - - def _update_all_reorder_map(self, reorder_map): - for origin_idx, map_idx in self.all_reorder_map.items(): - self.all_reorder_map[origin_idx] = reorder_map[map_idx] - - def _reorder_self_node_list(self, reorder_map): - new_node_list = [None for _ in range(len(self.node_list))] - for old_idx, new_idx in reorder_map.items(): - new_node_list[new_idx] = self.node_list[old_idx] - self.node_list = new_node_list - - def _reorder_idx_trace(self, reorder_map): - # reorder list - new_idx_trace_list = [None for _ in range(len(self.idx_trace_list))] - for old_idx, new_idx in reorder_map.items(): - new_idx_trace_list[new_idx] = self.idx_trace_list[old_idx] - self.idx_trace_list = new_idx_trace_list - # update compute - for 
idx_trace in self.idx_trace_list: - compute = idx_trace["compute"] - for dim_compute in compute: - for idx, i in enumerate(dim_compute): - dim_compute[idx] = reorder_map[i] - # update source - for idx_trace in self.idx_trace_list: - source = idx_trace["source"] - for dim_idx, dim_source in enumerate(source): - new_dim_source = {} - for k, v in dim_source.items(): - new_dim_source[reorder_map[k]] = v - source[dim_idx] = new_dim_source - - def reorder_all(self, chunk_info): - if chunk_info is None: - return chunk_info - if len(chunk_info["args"]["prepose_nodes"]) == 0: - return chunk_info - reorder_map = self._get_reorder_map(chunk_info) - self._update_all_reorder_map(reorder_map) - self._reorder_idx_trace(reorder_map) - self._reorder_self_node_list(reorder_map) - chunk_info = self._reorder_chunk_info(chunk_info, reorder_map) - return chunk_info - - def reorder_node_list(self, node_list): - new_node_list = [None for _ in range(len(node_list))] - for old_idx, new_idx in self.all_reorder_map.items(): - new_node_list[new_idx] = node_list[old_idx] - return new_node_list - - def tmp_reorder(self, node_list, chunk_info): - if len(chunk_info["args"]["prepose_nodes"]) == 0: - return node_list, chunk_info - reorder_map = self._get_reorder_map(chunk_info) - - # new tmp node list - new_node_list = [None for _ in range(len(node_list))] - for old_idx, new_idx in reorder_map.items(): - new_node_list[new_idx] = node_list[old_idx] - - chunk_info = self._reorder_chunk_info(chunk_info, reorder_map) - return new_node_list, chunk_info - - -class MemoryEstimator(object): - def __init__(self, index_tracer: IndexTracer) -> None: - pass - - def _get_meta_node_size(self, x): - x = x.meta["tensor_meta"] - x = x.numel * torch.tensor([], dtype=x.dtype).element_size() - return x - - def _get_output_node(self, n): - fwd_out = { - x.uuid: x - for x in n.meta["fwd_out"] - if isinstance(x, torch.Tensor) and hasattr(x, "uuid") - } - out_size = activation_size(fwd_out) - out_node = [n.name] if 
out_size > 0 else [] - # if any(i in n.name for i in ['transpose', 'permute', 'view']): - # out_size = 0 - return out_size, out_node - - def _get_output_node_size(self, n): - return self._get_output_node(n)[0] - - def _add_active_node(self, n, active_list): - new_active = self._get_output_node(n)[1] - if n.op == "placeholder": - new_active.append(n.name) - for i in new_active: - if i not in active_list: - active_list.append(i) - - def _get_delete_node(self, user, user_to_last_uses, to_keep=None): - delete_size = 0 - delete_node = [] - if user.op not in ("output",): - nodes_to_delete = user_to_last_uses.get(user, []) - if to_keep is not None: - keep_list = [] - for n in nodes_to_delete: - if n.name in to_keep: - keep_list.append(n) - for n in keep_list: - if n in nodes_to_delete: - nodes_to_delete.remove(n) - if len(nodes_to_delete): - out_node = [self._get_output_node(i) for i in nodes_to_delete] - delete_size = sum([i[0] for i in out_node]) - for i in range(len(out_node)): - if out_node[i][0] > 0: - delete_node.append(out_node[i][1][0]) - elif nodes_to_delete[i].op == "placeholder": - delete_node.append(nodes_to_delete[i].name) - # elif any(j in nodes_to_delete[i].name for j in ['transpose', 'permute', 'view']): - # delete_node.append(nodes_to_delete[i].name) - return delete_size, delete_node - - def _get_delete_node_size(self, user, user_to_last_uses, to_keep): - return self._get_delete_node(user, user_to_last_uses, to_keep)[0] - - def _remove_deactive_node(self, user, user_to_last_uses, active_list): - delete_node = self._get_delete_node(user, user_to_last_uses)[1] - for i in delete_node: - if i in active_list: - active_list.remove(i) - - def _get_chunk_inputs_size( - self, chunk_inputs, chunk_inputs_non_chunk, node_list, chunk_end_idx - ): - nodes_to_delete = [] - for chunk_input in chunk_inputs + chunk_inputs_non_chunk: - chunk_input_users = chunk_input.users.keys() - chunk_input_users_idx = [ - _find_idx_by_name(i.name, node_list) for i in chunk_input_users - 
] - if all(i <= chunk_end_idx for i in chunk_input_users_idx): - if chunk_input not in nodes_to_delete: - nodes_to_delete.append(chunk_input) - out_node = [self._get_output_node(i) for i in nodes_to_delete] - delete_size = sum([i[0] for i in out_node]) - return delete_size - - def _get_last_usr(self, nodes): - node_to_last_use: Dict[Node, Node] = {} - user_to_last_uses: Dict[Node, List[Node]] = {} - - def register_last_uses(n: Node, user: Node): - if n not in node_to_last_use: - node_to_last_use[n] = user - user_to_last_uses.setdefault(user, []).append(n) - - for node in reversed(nodes): - map_arg(node.args, lambda n: register_last_uses(n, node)) - map_arg(node.kwargs, lambda n: register_last_uses(n, node)) - return user_to_last_uses - - def _get_contiguous_memory(self, node, not_contiguous_list, delete=False): - mem = 0 - not_contiguous_ops = ["permute"] - inherit_contiguous_ops = ["transpose", "view"] - - if node.op == "call_function" and any( - n in node.name for n in ["matmul", "reshape"] - ): - for n in node.args: - if n in not_contiguous_list: - # matmul won't change origin tensor, but create a tmp copy - mem += self._get_output_node_size(n) - elif node.op == "call_module": - for n in node.args: - if n in not_contiguous_list: - # module will just make origin tensor to contiguous - if delete: - not_contiguous_list.remove(n) - elif node.op == "call_method" and any( - i in node.name for i in not_contiguous_ops - ): - if node not in not_contiguous_list: - not_contiguous_list.append(node) - return mem - - def _get_chunk_ratio(self, node, chunk_node_dim, chunk_size): - if node not in chunk_node_dim: - return 1.0 - node_shape = _get_node_shape(node) - chunk_dim = chunk_node_dim[node]["chunk_dim"] - if chunk_dim is None: - return 1.0 - else: - return float(chunk_size) / node_shape[chunk_dim] - - def _get_chunk_delete_node_size( - self, user, user_to_last_uses, chunk_ratio, chunk_inputs_names - ): - # if any(j in user.name for j in ['transpose', 'permute', 'view']): - 
# return 0 - if user.op in ("placeholder", "output"): - return 0 - nodes_to_delete = user_to_last_uses.get(user, []) - delete_size = 0 - for n in nodes_to_delete: - if n.name in chunk_inputs_names: - continue - delete_size += self._get_output_node_size(n) * chunk_ratio - return delete_size - - def _print_mem_log(self, log, nodes, title=None): - if title: - print(title) - for idx, (l, n) in enumerate(zip(log, nodes)): - print("%s:%.2f \t" % (n.name, l), end="") - if (idx + 1) % 3 == 0: - print("") - print("\n") - - def _print_compute_op_mem_log(self, log, nodes, title=None): - if title: - print(title) - for idx, (l, n) in enumerate(zip(log, nodes)): - if n.op in ["placeholder", "get_attr", "output"]: - continue - if any(i in n.name for i in ["getitem", "getattr"]): - continue - print("%s:%.2f \t" % (n.name, l), end="") - if (idx + 1) % 3 == 0: - print("") - print("\n") - - def estimate_chunk_inference_mem( - self, - node_list, - chunk_infos=None, - print_mem=False, - ): - act_memory = 0.0 - act_memory_peak_log = [] - act_memory_after_node_log = [] - active_node_list = [] - active_node_list_log = [] - not_contiguous_list = [] - user_to_last_uses = self._get_last_usr(node_list) - user_to_last_uses_no_free_var = self._get_last_usr(node_list) - _delete_free_var_from_last_use(user_to_last_uses_no_free_var) - - use_chunk = True if chunk_infos is not None else False - chunk_within = False - chunk_region_idx = None - chunk_ratio = 1 # use it to estimate chunk mem - chunk_inputs_names = [] - - if use_chunk: - chunk_regions = [i["region"] for i in chunk_infos] - chunk_starts = [i[0] for i in chunk_regions] - chunk_ends = [i[1] for i in chunk_regions] - chunk_inputs = [i["inputs"] for i in chunk_infos] - chunk_inputs_non_chunk = [i["inputs_non_chunk"] for i in chunk_infos] - chunk_inputs_names = [j.name for i in chunk_inputs for j in i] + [ - j.name for i in chunk_inputs_non_chunk for j in i - ] - chunk_outputs = [i["outputs"][0] for i in chunk_infos] - chunk_node_dim = 
[i["node_chunk_dim"] for i in chunk_infos] - chunk_sizes = [ - i["chunk_size"] if "chunk_size" in i else 1 for i in chunk_infos - ] - - for idx, node in enumerate(node_list): - # if node in chunk start nodes, change chunk ratio and add chunk_tensor - if use_chunk and idx in chunk_starts: - chunk_within = True - chunk_region_idx = chunk_starts.index(idx) - act_memory += self._get_output_node_size( - chunk_outputs[chunk_region_idx] - ) / (1024**2) - - # determine chunk ratio for current node - if chunk_within: - chunk_ratio = self._get_chunk_ratio( - node, - chunk_node_dim[chunk_region_idx], - chunk_sizes[chunk_region_idx], - ) - - # if node is placeholder, just add the size of the node - if node.op == "placeholder": - act_memory += self._get_meta_node_size(node) * chunk_ratio / (1024**2) - act_memory_peak_log.append(act_memory) - # skip output - elif node.op == "output": - continue - # no change for non compute node - elif _is_non_compute_node_except_placeholder(node): - act_memory_peak_log.append(act_memory) - # node is a compute op - # calculate tmp, output node and delete node memory - else: - # forward memory - # TODO: contiguous_memory still not accurate for matmul, view, reshape and transpose - act_memory += ( - self._get_contiguous_memory(node, not_contiguous_list) - * chunk_ratio - / (1024**2) - ) - act_memory += ( - self._get_output_node_size(node) * chunk_ratio / (1024**2) - ) - # record max act memory - act_memory_peak_log.append(act_memory) - # delete useless memory - act_memory -= ( - self._get_contiguous_memory(node, not_contiguous_list, delete=True) - * chunk_ratio - / (1024**2) - ) - # delete unused vars not in chunk_input_list - # we can't delete input nodes until chunk ends - if chunk_within: - act_memory -= self._get_chunk_delete_node_size( - node, - user_to_last_uses_no_free_var, - chunk_ratio, - chunk_inputs_names, - ) / (1024**2) - else: - act_memory -= self._get_delete_node_size( - node, user_to_last_uses_no_free_var, chunk_inputs_names - ) / 
(1024**2) - - # log active node, only effective without chunk - self._add_active_node(node, active_node_list) - self._remove_deactive_node(node, user_to_last_uses, active_node_list) - - # if node in chunk end nodes, restore chunk settings - if use_chunk and idx in chunk_ends: - act_memory -= ( - self._get_output_node_size(node) * chunk_ratio / (1024**2) - ) - act_memory -= self._get_chunk_inputs_size( - chunk_inputs[chunk_region_idx], - chunk_inputs_non_chunk[chunk_region_idx], - node_list, - chunk_regions[chunk_region_idx][1], - ) / (1024**2) - chunk_within = False - chunk_ratio = 1 - chunk_region_idx = None - - act_memory_after_node_log.append(act_memory) - active_node_list_log.append(copy.deepcopy(active_node_list)) - - if print_mem: - print("with chunk" if use_chunk else "without chunk") - # self._print_mem_log(act_memory_peak_log, node_list, "peak") - # self._print_mem_log(act_memory_after_node_log, node_list, "after") - self._print_compute_op_mem_log(act_memory_peak_log, node_list, "peak") - # self._print_compute_op_mem_log( - # act_memory_after_node_log, node_list, "after" - # ) - - # param_memory = parameter_size(gm) - # all_memory = act_memory + param_memory - return act_memory_peak_log, act_memory_after_node_log, active_node_list_log - - -class ChunkSelector(object): - def __init__( - self, - index_tracer: IndexTracer, - memory_estimator: MemoryEstimator, - max_memory=None, - ): - self.index_tracer = index_tracer - self.memory_estimator = memory_estimator - if max_memory is not None: - self.stratge = "fit_memory" - self.max_memory = max_memory # MB - else: - self.stratge = "min_memory" - - def _select_best_chunk_region( - self, possible_chunk_regions, chunk_infos, peak_node, max_chunk_region, mem_peak - ): - if self.stratge == "min_memory": - best_region = self._select_min_memory_chunk_region( - possible_chunk_regions, - chunk_infos, - peak_node, - max_chunk_region, - mem_peak, - ) - elif self.stratge == "fit_memory": - best_region = 
self._select_fit_memory_chunk_region( - possible_chunk_regions, - chunk_infos, - peak_node, - max_chunk_region, - mem_peak, - ) - else: - raise RuntimeError() - return best_region - - def _select_fit_memory_chunk_region( - self, possible_chunk_regions, chunk_infos, peak_node, max_chunk_region, mem_peak - ): - # stop chunk if max memory satisfy memory limit - if max(mem_peak) < self.max_memory: - return None - - # remove illegal regions - illegal_regions = [] - for i in possible_chunk_regions: - if not self._is_legal_region(i, chunk_infos): - illegal_regions.append(i) - for i in illegal_regions: - if i in possible_chunk_regions: - possible_chunk_regions.remove(i) - - if len(possible_chunk_regions) == 0: - return None - - # get mem for chunk region - regions_dict = [] - for region in possible_chunk_regions: - cur_region = region.copy() - cur_node_list, cur_region = self.index_tracer.tmp_reorder( - self.index_tracer.node_list, cur_region - ) - cur_chunk_infos = chunk_infos + [cur_region] - cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem( - cur_node_list, cur_chunk_infos - )[0] - cur_chunk_region_peak = cur_mem_peak[ - max_chunk_region[0] : max_chunk_region[1] + 1 - ] - cur_chunk_region_max_peak = max(cur_chunk_region_peak) - if cur_chunk_region_max_peak < self.max_memory: - regions_dict.append( - { - "chunk_info": region, - "chunk_max_mem": cur_chunk_region_max_peak, - "chunk_len": self._get_compute_node_num( - region["region"][0], region["region"][1] - ), - "reorder_chunk_info": cur_region, - "reorder_node_list": cur_node_list, - } - ) - # no region found - if len(regions_dict) == 0: - raise RuntimeError("Search failed. 
Try a larger memory threshold.") - - # select the min chunk len - chunk_len = [i["chunk_len"] for i in regions_dict] - best_region_idx = chunk_len.index(min(chunk_len)) - best_region = regions_dict[best_region_idx] - - # get max chunk size - best_region = self._get_fit_chunk_size(best_region, chunk_infos) - return best_region - - def _get_fit_chunk_size(self, chunk_region_dict, chunk_infos): - chunk_size = 1 - reorder_chunk_info = chunk_region_dict["reorder_chunk_info"] - reorder_chunk_info["chunk_size"] = chunk_size - cur_chunk_max_mem = 0 - # search a region - while cur_chunk_max_mem < self.max_memory: - chunk_size *= 2 - reorder_chunk_info["chunk_size"] = chunk_size - cur_chunk_infos = chunk_infos + [reorder_chunk_info] - cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem( - chunk_region_dict["reorder_node_list"], cur_chunk_infos - )[0] - cur_chunk_max_mem = max( - cur_mem_peak[ - reorder_chunk_info["region"][0] : reorder_chunk_info["region"][1] - + 1 - ] - ) - # search exact size - chunk_info = chunk_region_dict["chunk_info"] - chunk_info["chunk_size"] = self._chunk_size_binary_search( - chunk_size // 2, chunk_size, chunk_region_dict, chunk_infos - ) - return chunk_info - - def _chunk_size_binary_search(self, l, r, chunk_region_dict, chunk_infos): - if l >= 16: - gap = 4 - else: - gap = 1 - chunk_info = chunk_region_dict["reorder_chunk_info"] - while r >= l + gap: - mid = int((l + r) / 2 + 0.5) - chunk_info["chunk_size"] = mid - cur_chunk_infos = chunk_infos + [chunk_info] - cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem( - chunk_region_dict["reorder_node_list"], cur_chunk_infos - )[0] - cur_chunk_max_mem = max( - cur_mem_peak[chunk_info["region"][0] : chunk_info["region"][1] + 1] - ) - if cur_chunk_max_mem >= self.max_memory: - r = mid - gap - else: - l = mid + gap - return l - - def _get_compute_node_num(self, start, end): - count = 0 - for i in self.index_tracer.node_list[start : end + 1]: - if not _is_non_compute_node(i): - 
count += 1 - return count - - def _select_min_memory_chunk_region( - self, possible_chunk_regions, chunk_infos, peak_node, max_chunk_region, mem_peak - ): - # remove illegal regions - illegal_regions = [] - for i in possible_chunk_regions: - if not self._is_legal_region(i, chunk_infos): - illegal_regions.append(i) - for i in illegal_regions: - if i in possible_chunk_regions: - possible_chunk_regions.remove(i) - - if len(possible_chunk_regions) == 0: - return None - - # get mem for chunk region - regions_dict = [] - for region in possible_chunk_regions: - cur_region = region.copy() - cur_node_list, cur_region = self.index_tracer.tmp_reorder( - self.index_tracer.node_list, cur_region - ) - cur_chunk_infos = chunk_infos + [cur_region] - cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem( - cur_node_list, cur_chunk_infos - )[0] - cur_chunk_region_peak = cur_mem_peak[ - max_chunk_region[0] : max_chunk_region[1] + 1 - ] - cur_chunk_region_max_peak = max(cur_chunk_region_peak) - regions_dict.append( - { - "chunk_info": region, - "chunk_max_mem": cur_chunk_region_max_peak, - "chunk_len": self._get_compute_node_num( - region["region"][0], region["region"][1] - ), - "reorder_chunk_info": cur_region, - "reorder_node_list": cur_node_list, - } - ) - - # select the min mem - chunk_max_mem = [i["chunk_max_mem"] for i in regions_dict] - best_region_idx = chunk_max_mem.index(min(chunk_max_mem)) - best_region = regions_dict[best_region_idx]["chunk_info"] - if best_region is not None: - best_region["chunk_size"] = 1 - return best_region - - def _is_legal_region(self, cur_chunk_info, chunk_infos): - (chunk_region_start, chunk_region_end) = cur_chunk_info["region"] - if cur_chunk_info in chunk_infos: - return False - if chunk_region_end < chunk_region_start: - return False - for i in chunk_infos: - region = i["region"] - if not ( - (chunk_region_start > region[1] and chunk_region_end > region[1]) - or (chunk_region_start < region[0] and chunk_region_end < region[0]) - ): 
- return False - return True - - -class ChunkRegionSearch(object): - def __init__(self, gm, max_memory=None) -> None: - self.gm = gm - self.index_tracer = IndexTracer(list(gm.graph.nodes)) - self.index_tracer.trace_index() - self.memory_estimator = MemoryEstimator(self.index_tracer) - self.chunk_selector = ChunkSelector( - self.index_tracer, self.memory_estimator, max_memory=max_memory - ) - - def _find_peak_node(self, mem_peak): - max_value = max(mem_peak) - max_idx = mem_peak.index(max_value) - return max_idx - - def _get_free_var(self): - free_var_idx = [] - for idx, n in enumerate(self.index_tracer.node_list): - if n.op == "placeholder": - free_var_idx.append(idx) - return free_var_idx - - def _get_min_free_var(self, active_node_list, free_vars): - min_len = 999 - for idx, n in enumerate(active_node_list): - if idx in free_vars: - continue - if len(n) < min_len: - min_len = len(n) - return min_len - - def _search_max_chunk_region(self, active_node, peak_node, chunk_regions): - free_vars = self._get_free_var() - free_var_num = len(free_vars) - active_node_num = [len(i) for i in active_node] - min_active_node_num = min(active_node_num[free_var_num:]) - threshold = max(free_var_num, min_active_node_num) - - # from peak_node to free_var - inside_flag = False - chunk_region_start = free_var_num - for i in range(peak_node, -1, -1): - if active_node_num[i] <= threshold: - inside_flag = True - if inside_flag and active_node_num[i] > threshold: - chunk_region_start = i + 1 - break - - # from peak_node to len-2 - inside_flag = False - chunk_region_end = len(active_node) - 1 - for i in range(peak_node, len(active_node)): - if active_node_num[i] <= threshold: - inside_flag = True - if inside_flag and active_node_num[i] > threshold: - chunk_region_end = i - break - - for i in chunk_regions: - region = i["region"] - if chunk_region_start >= region[0] and chunk_region_end <= region[1]: - return None - elif ( - region[0] <= chunk_region_start <= region[1] - and 
chunk_region_end > region[1] - ): - chunk_region_start = region[1] + 1 - elif ( - region[0] <= chunk_region_end <= region[1] - and chunk_region_start < region[0] - ): - chunk_region_end = region[0] - 1 - return chunk_region_start, chunk_region_end - - def _is_not_compute(self, trace, chunk_range, dim_idx): - if trace["idx"][dim_idx] not in trace["compute"]: - return True - if trace["idx"][dim_idx] in trace["compute"] and all( - i < chunk_range[0] or i > chunk_range[1] - for i in trace["compute"][trace["idx"][dim_idx]] - ): - return True - return False - - def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx): - start_traces = input_trace[start_idx] - end_trace = output_trace[end_idx] - end_node = self.index_tracer.node_list[end_idx] - chunk_infos = [] - for end_dim, _ in enumerate(end_trace["idx"]): - if len(start_traces) > 1: - continue - for start_node, start_trace in start_traces.items(): - for start_dim, _ in enumerate(start_trace["idx"]): - # dim size cannot be 1 - if ( - _get_node_shape(end_node)[end_dim] == 1 - or _get_node_shape(start_node)[start_dim] == 1 - ): - continue - # check index source align - if not self.index_tracer.check_index_source( - start_dim, start_node, start_idx, end_dim, end_node - ): - continue - # check index copmute - if not self.index_tracer.check_index_compute( - start_idx, end_dim, end_node, end_idx - ): - continue - # flow search - chunk_info = self.index_tracer.flow_search( - start_idx, start_dim, end_idx, end_dim - ) - if chunk_info is None: - continue - # check index copmute - if not self.index_tracer.check_index_duplicate(chunk_info): - continue - chunk_infos.append(chunk_info) - return chunk_infos - - def _search_possible_chunk_regions(self, max_chunk_region, peak_node): - possible_chunk_region = [] - output_trace = copy.deepcopy(self.index_tracer.idx_trace_list) - input_trace = [] # trace of a node's input nodes - for _, n in enumerate(self.index_tracer.node_list): - cur_trace = {} - for arg in n.args: - if 
type(arg) == type(n) and not _is_non_compute_node_except_placeholder( - arg - ): - cur_trace[arg] = self.index_tracer._find_trace_from_node(arg) - input_trace.append(cur_trace) - - for start_idx in range(max_chunk_region[0], peak_node + 1): - for end_idx in range(peak_node, max_chunk_region[1] + 1): - # skip non compute nodes - if _is_non_compute_node( - self.index_tracer.node_list[start_idx] - ) or _is_non_compute_node(self.index_tracer.node_list[end_idx]): - continue - - # select free dim - chunk_info = self._find_free_dim( - input_trace, output_trace, start_idx, end_idx - ) - if len(chunk_info) > 0: - possible_chunk_region.extend(chunk_info) - return possible_chunk_region - - def _step_search(self, mem_peak, active_node, chunk_regions): - peak_node = self._find_peak_node(mem_peak) - max_chunk_region = self._search_max_chunk_region( - active_node, peak_node, chunk_regions - ) - if max_chunk_region == None: - return None - possible_chunk_regions = self._search_possible_chunk_regions( - max_chunk_region, peak_node - ) - best_chunk_region = self.chunk_selector._select_best_chunk_region( - possible_chunk_regions, chunk_regions, peak_node, max_chunk_region, mem_peak - ) - best_chunk_region = self.index_tracer.reorder_all(best_chunk_region) - return best_chunk_region - - def _stop_search(self, init_mem_peak, mem_peak): - sorted_init_mem_peak = sorted(init_mem_peak) - if max(mem_peak) < sorted_init_mem_peak[int(len(sorted_init_mem_peak) * 0.5)]: - return True - return False - - def search_region(self): - chunk_infos = [] - ( - init_mem_peak, - _, - active_node, - ) = self.memory_estimator.estimate_chunk_inference_mem( - self.index_tracer.node_list - ) - mem_peak = init_mem_peak - - while True: - chunk_info = self._step_search(mem_peak, active_node, chunk_infos) - if chunk_info is None: - break - chunk_infos.append(chunk_info) - - ( - mem_peak, - _, - active_node, - ) = self.memory_estimator.estimate_chunk_inference_mem( - self.index_tracer.node_list, chunk_infos - ) - 
if self._stop_search(init_mem_peak, mem_peak): - break - self.memory_estimator.estimate_chunk_inference_mem( - self.index_tracer.node_list, chunk_infos, print_mem=True - ) - return chunk_infos - - -def _gen_chunk_slice_dim(chunk_dim, chunk_idx_name, shape): - new_shape = "[" - for idx, i in enumerate(shape): - if idx == chunk_dim: - new_shape += "%s:%s + chunk_size" % (chunk_idx_name, chunk_idx_name) - else: - new_shape += ":" - new_shape += ", " - new_shape = new_shape[:-2] + "]" - return new_shape - - -def _gen_loop_start(chunk_input, chunk_output, chunk_ouput_dim, chunk_size=2): - input_node = chunk_input[0] - out_shape = _get_node_shape(chunk_output) - out_str = str(list(out_shape)) - context = ( - "chunk_result = torch.empty(%s, dtype=%s.dtype, device=%s.device); chunk_size = %d\nfor chunk_idx in range" - % (out_str, input_node.name, input_node.name, chunk_size) - ) - context += "(0, %d, chunk_size):\n" % (out_shape[chunk_ouput_dim]) - return context - - -def _gen_loop_end( - chunk_inputs, chunk_non_compute_inputs, chunk_outputs, chunk_outputs_dim, node_list -): - chunk_outputs_name = chunk_outputs.name - chunk_outputs_idx = _find_idx_by_name(chunk_outputs_name, node_list) - chunk_output_shape = chunk_outputs.meta["tensor_meta"].shape - chunk_slice = _gen_chunk_slice_dim( - chunk_outputs_dim, "chunk_idx", chunk_output_shape - ) - context = " chunk_result%s = %s; %s = None\n" % ( - chunk_slice, - chunk_outputs_name, - chunk_outputs_name, - ) - context += ( - chunk_outputs_name + " = chunk_result; chunk_result = None; chunk_size = None" - ) - - # determine if its the last use for chunk input - for chunk_input in chunk_inputs + chunk_non_compute_inputs: - if all( - [ - _find_idx_by_name(user.name, node_list) <= chunk_outputs_idx - for user in chunk_input.users.keys() - ] - ): - context += "; %s = None" % chunk_input.name - - context += "\n" - return context - - -def _find_chunk_all_input_nodes(nodes: List[Node]): - """ - Find non-compute input and output node 
names. - input nodes are nodes used in the list - output nodes are nodes will use nodes in the list - """ - input_nodes = [] - for node in nodes: - for input_node in node._input_nodes.keys(): - if input_node not in nodes and input_node not in input_nodes: - input_nodes.append(input_node) - return input_nodes - - -def _find_chunk_compute_input_and_output_nodes(nodes: List[Node]): - """ - Find non-compute input and output node names. - input nodes are nodes used in the list - output nodes are nodes will use nodes in the list - """ - input_nodes = [] - output_nodes = [] - - # if a node has an input node which is not in the node list - # we treat that input node as the input of the checkpoint function - for node in nodes: - for input_node in node._input_nodes.keys(): - if ( - input_node not in nodes - and input_node not in input_nodes - and not _is_non_compute_node_except_placeholder(input_node) - ): - input_nodes.append(input_node) - - # if a node has a user node which is not in the node list - # we treat that user node as the node receiving the current node output - for node in nodes: - for output_node in node.users.keys(): - if ( - output_node not in nodes - and node not in output_nodes - and not _is_non_compute_node_except_placeholder_output(output_node) - ): - output_nodes.append(node) - - return input_nodes, output_nodes - - -def _find_idx_by_name(name, nodes_list): - for idx, node in enumerate(nodes_list): - if node.name == name: - return idx - raise RuntimeError("name %s not found in node list" % name) - - -def _replace_name(context, name_from, name_to): - patterns = [(" ", " "), (" ", "."), (" ", ","), ("(", ")"), ("(", ","), (" ", ")")] - for p in patterns: - source = p[0] + name_from + p[1] - target = p[0] + name_to + p[1] - if source in context: - context = context.replace(source, target) - return context - - -def _replace_reshape_size(context, node_name, reshape_size_dict): - if node_name not in reshape_size_dict: - return context - for size_name, 
size_value in reshape_size_dict[node_name].items(): - context = context.replace(size_name, size_value) - return context - - -def emit_code_with_chunk( - body, - nodes, - emit_node_func, - delete_unused_value_func, - chunk_region_search, - chunk_infos -): - """Emit code with nested activation checkpoint - When we detect some of the node.activation_checkpoint is a List, we will use - this function to emit the activation checkpoint codes. - - Args: - body: forward code - ckpt_func: checkpoint functions code - nodes: graph.nodes - emit_node_func: function to emit node - delete_unused_value_func: function to remove the unused value - """ - node_list = list(nodes) - - chunk_regions = [i["region"] for i in chunk_infos] - chunk_starts = [i[0] for i in chunk_regions] - chunk_ends = [i[1] for i in chunk_regions] - - chunk_inputs = [i["inputs"] for i in chunk_infos] - chunk_inputs_non_chunk = [i["inputs_non_chunk"] for i in chunk_infos] - chunk_inputs_dim = [i["inputs_dim"] for i in chunk_infos] - chunk_inputs_names = [j.name for i in chunk_inputs for j in i] + [ - j.name for i in chunk_inputs_non_chunk for j in i - ] - - chunk_outputs = [i["outputs"][0] for i in chunk_infos] - chunk_outputs_dim = [i["outputs_dim"] for i in chunk_infos] - - node_list = chunk_region_search.index_tracer.reorder_node_list(node_list) - node_idx = 0 - region_idx = 0 - within_chunk_region = False - - while node_idx < len(node_list): - node = node_list[node_idx] - - if node_idx in chunk_starts: - within_chunk_region = True - region_idx = chunk_starts.index(node_idx) - body.append( - _gen_loop_start( - chunk_inputs[region_idx], - chunk_outputs[region_idx], - chunk_outputs_dim[region_idx], - chunk_infos[region_idx]["chunk_size"], - ) - ) - - if within_chunk_region: - emit_node_func(node, body) - # replace input var with chunk var - for input_node_idx, input_node in enumerate(chunk_inputs[region_idx]): - for idx, dim in chunk_inputs_dim[region_idx][input_node_idx].items(): - if idx == node_idx: - 
chunk_slice = _gen_chunk_slice_dim( - dim[0], "chunk_idx", _get_node_shape(input_node) - ) - body[-1] = _replace_name( - body[-1], input_node.name, input_node.name + chunk_slice - ) - # ones like - if "ones_like" in node.name: - meta_node = chunk_region_search.index_tracer.node_list[node_idx] - chunk_dim = chunk_infos[region_idx]["node_chunk_dim"][meta_node][ - "chunk_dim" - ] - if _get_node_shape(meta_node)[chunk_dim] != 1: - source_node = meta_node.args[0].args[0] - if ( - source_node not in chunk_infos[region_idx]["node_chunk_dim"] - or chunk_infos[region_idx]["node_chunk_dim"][source_node][ - "chunk_dim" - ] - is None - ): - chunk_slice = _gen_chunk_slice_dim( - chunk_dim, "chunk_idx", _get_node_shape(node) - ) - body[-1] = _replace_name( - body[-1], node.args[0].name, node.args[0].name + chunk_slice - ) - body[-1] = _replace_reshape_size( - body[-1], node.name, chunk_infos[region_idx]["reshape_size"] - ) - body[-1] = " " + body[-1] - delete_unused_value_func(node, body, chunk_inputs_names) - else: - emit_node_func(node, body) - if node_idx not in chunk_inputs: - delete_unused_value_func(node, body, chunk_inputs_names) - - if node_idx in chunk_ends: - body.append( - _gen_loop_end( - chunk_inputs[region_idx], - chunk_inputs_non_chunk[region_idx], - chunk_outputs[region_idx], - chunk_outputs_dim[region_idx], - node_list, - ) - ) - within_chunk_region = False - - node_idx += 1 - - -if CODEGEN_AVAILABLE: - - class ChunkCodeGen(CodeGen): - def __init__(self, meta_graph, max_memory=None): - super().__init__() - self.meta_graph = meta_graph - self.max_memory = max_memory - self.meta_node = list(meta_graph.graph.nodes) - # find the chunk regions - self.chunk_region_search = ChunkRegionSearch(meta_graph, max_memory) - self.chunk_infos = self.chunk_region_search.search_region() - - def _gen_python_code( - self, nodes, root_module: str, namespace: _Namespace - ) -> PythonCode: - free_vars: List[str] = [] - body: List[str] = [] - globals_: Dict[str, Any] = {} - 
wrapped_fns: Dict[str, None] = {} - - # Wrap string in list to pass by reference - maybe_return_annotation: List[str] = [""] - - def add_global(name_hint: str, obj: Any): - """Add an obj to be tracked as a global. - - We call this for names that reference objects external to the - Graph, like functions or types. - - Returns: the global name that should be used to reference 'obj' in generated source. - """ - if ( - _is_from_torch(obj) and obj != torch.device - ): # to support registering torch.device - # HACK: workaround for how torch custom ops are registered. We - # can't import them like normal modules so they must retain their - # fully qualified name. - return _get_qualified_name(obj) - - # normalize the name hint to get a proper identifier - global_name = namespace.create_name(name_hint, obj) - - if global_name in globals_: - assert globals_[global_name] is obj - return global_name - globals_[global_name] = obj - return global_name - - # set _custom_builtins here so that we needn't import colossalai in forward - _custom_builtins["colossalai"] = _CustomBuiltin( - "import colossalai", colossalai - ) - - # Pre-fill the globals table with registered builtins. - for name, (_, obj) in _custom_builtins.items(): - add_global(name, obj) - - def type_repr(o: Any): - if o == (): - # Empty tuple is used for empty tuple type annotation Tuple[()] - return "()" - - typename = _type_repr(o) - - if hasattr(o, "__origin__"): - # This is a generic type, e.g. typing.List[torch.Tensor] - origin_type = _origin_type_map.get(o.__origin__, o.__origin__) - origin_typename = add_global(_type_repr(origin_type), origin_type) - - if hasattr(o, "__args__"): - # Assign global names for each of the inner type variables. 
- args = [type_repr(arg) for arg in o.__args__] - - if len(args) == 0: - # Bare type, such as `typing.Tuple` with no subscript - # This code-path used in Python < 3.9 - return origin_typename - - return f'{origin_typename}[{",".join(args)}]' - else: - # Bare type, such as `typing.Tuple` with no subscript - # This code-path used in Python 3.9+ - return origin_typename - - # Common case: this is a regular module name like 'foo.bar.baz' - return add_global(typename, o) - - def _format_args( - args: Tuple[Argument, ...], kwargs: Dict[str, Argument] - ) -> str: - def _get_repr(arg): - # Handle NamedTuples (if it has `_fields`) via add_global. - if isinstance(arg, tuple) and hasattr(arg, "_fields"): - qualified_name = _get_qualified_name(type(arg)) - global_name = add_global(qualified_name, type(arg)) - return f"{global_name}{repr(tuple(arg))}" - return repr(arg) - - args_s = ", ".join(_get_repr(a) for a in args) - kwargs_s = ", ".join(f"{k} = {_get_repr(v)}" for k, v in kwargs.items()) - if args_s and kwargs_s: - return f"{args_s}, {kwargs_s}" - return args_s or kwargs_s - - # Run through reverse nodes and record the first instance of a use - # of a given node. This represents the *last* use of the node in the - # execution order of the program, which we will use to free unused - # values - node_to_last_use: Dict[Node, Node] = {} - user_to_last_uses: Dict[Node, List[Node]] = {} - - def register_last_uses(n: Node, user: Node): - if n not in node_to_last_use: - node_to_last_use[n] = user - user_to_last_uses.setdefault(user, []).append(n) - - for node in reversed(nodes): - map_arg(node.args, lambda n: register_last_uses(n, node)) - map_arg(node.kwargs, lambda n: register_last_uses(n, node)) - - _delete_free_var_from_last_use(user_to_last_uses) - - # NOTE: we add a variable to distinguish body and ckpt_func - def delete_unused_values(user: Node, body, to_keep=[]): - """ - Delete values after their last use. 
This ensures that values that are - not used in the remainder of the code are freed and the memory usage - of the code is optimal. - """ - if user.op == "placeholder": - return - if user.op == "output": - body.append("\n") - return - nodes_to_delete = user_to_last_uses.get(user, []) - nodes_to_delete = [i for i in nodes_to_delete if i.name not in to_keep] - if len(nodes_to_delete): - to_delete_str = " = ".join( - [repr(n) for n in nodes_to_delete] + ["None"] - ) - body.append(f"; {to_delete_str}\n") - else: - body.append("\n") - - # NOTE: we add a variable to distinguish body and ckpt_func - def emit_node(node: Node, body): - maybe_type_annotation = ( - "" if node.type is None else f" : {type_repr(node.type)}" - ) - if node.op == "placeholder": - assert isinstance(node.target, str) - maybe_default_arg = ( - "" if not node.args else f" = {repr(node.args[0])}" - ) - free_vars.append( - f"{node.target}{maybe_type_annotation}{maybe_default_arg}" - ) - raw_name = node.target.replace("*", "") - if raw_name != repr(node): - body.append(f"{repr(node)} = {raw_name}\n") - return - elif node.op == "call_method": - assert isinstance(node.target, str) - body.append( - f"{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.target)}" - f"({_format_args(node.args[1:], node.kwargs)})" - ) - return - elif node.op == "call_function": - assert callable(node.target) - # pretty print operators - if ( - node.target.__module__ == "_operator" - and node.target.__name__ in magic_methods - ): - assert isinstance(node.args, tuple) - body.append( - f"{repr(node)}{maybe_type_annotation} = " - f"{magic_methods[node.target.__name__].format(*(repr(a) for a in node.args))}" - ) - return - - # pretty print inplace operators; required for jit.script to work properly - # not currently supported in normal FX graphs, but generated by torchdynamo - if ( - node.target.__module__ == "_operator" - and node.target.__name__ in inplace_methods - ): - body.append( - 
f"{inplace_methods[node.target.__name__].format(*(repr(a) for a in node.args))}; " - f"{repr(node)}{maybe_type_annotation} = {repr(node.args[0])}" - ) - return - - qualified_name = _get_qualified_name(node.target) - global_name = add_global(qualified_name, node.target) - # special case for getattr: node.args could be 2-argument or 3-argument - # 2-argument: attribute access; 3-argument: fall through to attrib function call with default value - if ( - global_name == "getattr" - and isinstance(node.args, tuple) - and isinstance(node.args[1], str) - and node.args[1].isidentifier() - and len(node.args) == 2 - ): - body.append( - f"{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.args[1])}" - ) - return - body.append( - f"{repr(node)}{maybe_type_annotation} = {global_name}({_format_args(node.args, node.kwargs)})" - ) - if node.meta.get("is_wrapped", False): - wrapped_fns.setdefault(global_name) - return - elif node.op == "call_module": - assert isinstance(node.target, str) - body.append( - f"{repr(node)}{maybe_type_annotation} = " - f"{_format_target(root_module, node.target)}({_format_args(node.args, node.kwargs)})" - ) - return - elif node.op == "get_attr": - assert isinstance(node.target, str) - body.append( - f"{repr(node)}{maybe_type_annotation} = {_format_target(root_module, node.target)}" - ) - return - elif node.op == "output": - if node.type is not None: - maybe_return_annotation[0] = f" -> {type_repr(node.type)}" - body.append(self.generate_output(node.args[0])) - return - raise NotImplementedError(f"node: {node.op} {node.target}") - - # Modified for activation checkpointing - ckpt_func = [] - - # if any node has a list of labels for activation_checkpoint, we - # will use nested type of activation checkpoint codegen - emit_code_with_chunk( - body, - nodes, - emit_node, - delete_unused_values, - self.chunk_region_search, - self.chunk_infos - ) - - if len(body) == 0: - # If the Graph has no non-placeholder nodes, no lines for the 
body - # have been emitted. To continue to have valid Python code, emit a - # single pass statement - body.append("pass\n") - - if len(wrapped_fns) > 0: - wrap_name = add_global("wrap", torch.fx.wrap) - wrap_stmts = "\n".join( - [f'{wrap_name}("{name}")' for name in wrapped_fns] - ) - else: - wrap_stmts = "" - - if self._body_transformer: - body = self._body_transformer(body) - - for name, value in self.additional_globals(): - add_global(name, value) - - # as we need colossalai.utils.checkpoint, we need to import colossalai - # in forward function - prologue = self.gen_fn_def(free_vars, maybe_return_annotation[0]) - prologue = "".join(ckpt_func) + prologue - prologue = prologue - - code = "".join(body) - code = "\n".join(" " + line for line in code.split("\n")) - fn_code = f""" -{wrap_stmts} - -{prologue} -{code}""" - # print(fn_code) - return PythonCode(fn_code, globals_) diff --git a/colossalai/autochunk/chunk_region_search.py b/colossalai/autochunk/chunk_region_search.py new file mode 100644 index 000000000000..0d0825f2584e --- /dev/null +++ b/colossalai/autochunk/chunk_region_search.py @@ -0,0 +1,211 @@ +from .index_tracer import IndexTracer +from .memory_estiamtor import MemoryEstimator +from .chunk_selector import ChunkSelector +import copy +from .utils import is_non_compute_node, is_non_compute_node_except_placeholder, get_node_shape + + +class ChunkRegionSearch(object): + def __init__(self, gm, max_memory=None) -> None: + self.gm = gm + self.index_tracer = IndexTracer(list(gm.graph.nodes)) + self.index_tracer.trace_index() + self.memory_estimator = MemoryEstimator(self.index_tracer) + self.chunk_selector = ChunkSelector( + self.index_tracer, self.memory_estimator, max_memory=max_memory + ) + + def _find_peak_node(self, mem_peak): + max_value = max(mem_peak) + max_idx = mem_peak.index(max_value) + return max_idx + + def _get_free_var(self): + free_var_idx = [] + for idx, n in enumerate(self.index_tracer.node_list): + if n.op == "placeholder": + 
class ChunkRegionSearch(object):
    """Search a traced FX graph for node regions whose activation-memory peak
    can be reduced by chunked (tiled) execution.

    The search loop repeatedly: finds the current memory-peak node, expands the
    widest legal region around it, enumerates chunkable (start, end, dim)
    candidates inside that region, and lets the ChunkSelector pick the best
    one — until no candidate helps or the peak is low enough.
    """

    def __init__(self, gm, max_memory=None) -> None:
        # gm: a torch.fx GraphModule. max_memory: optional budget in MB that
        # switches the selector into "fit_memory" mode.
        self.gm = gm
        self.index_tracer = IndexTracer(list(gm.graph.nodes))
        self.index_tracer.trace_index()
        self.memory_estimator = MemoryEstimator(self.index_tracer)
        self.chunk_selector = ChunkSelector(
            self.index_tracer, self.memory_estimator, max_memory=max_memory
        )

    def _find_peak_node(self, mem_peak):
        """Return the index of the (first) maximum of the per-node memory curve."""
        return mem_peak.index(max(mem_peak))

    def _get_free_var(self):
        """Return the indices of all placeholder (graph-input) nodes."""
        return [
            idx
            for idx, n in enumerate(self.index_tracer.node_list)
            if n.op == "placeholder"
        ]

    def _get_min_free_var(self, active_node_list, free_vars):
        """Return the smallest active-node count, ignoring free-var positions.

        Fixed: uses float("inf") instead of the magic sentinel 999, so the
        result is correct for arbitrarily long activation lists.
        """
        min_len = float("inf")
        for idx, actives in enumerate(active_node_list):
            if idx in free_vars:
                continue
            min_len = min(min_len, len(actives))
        return min_len

    def _search_max_chunk_region(self, active_node, peak_node, chunk_regions):
        """Expand the widest region around peak_node whose activation count
        stays at/below a threshold, then clip it against already-chunked
        regions.

        Returns:
            (start, end) node indices, or None when the peak already lies
            entirely inside an existing chunk region.
        """
        free_vars = self._get_free_var()
        free_var_num = len(free_vars)
        active_node_num = [len(i) for i in active_node]
        min_active_node_num = min(active_node_num[free_var_num:])
        threshold = max(free_var_num, min_active_node_num)

        # walk backwards from the peak; the region starts right after the
        # activation count rises above the threshold again
        inside_flag = False
        chunk_region_start = free_var_num
        for i in range(peak_node, -1, -1):
            if active_node_num[i] <= threshold:
                inside_flag = True
            if inside_flag and active_node_num[i] > threshold:
                chunk_region_start = i + 1
                break

        # walk forwards from the peak symmetrically to find the region end
        inside_flag = False
        chunk_region_end = len(active_node) - 1
        for i in range(peak_node, len(active_node)):
            if active_node_num[i] <= threshold:
                inside_flag = True
            if inside_flag and active_node_num[i] > threshold:
                chunk_region_end = i
                break

        # clip against regions that were already chunked in earlier steps
        for i in chunk_regions:
            region = i["region"]
            if chunk_region_start >= region[0] and chunk_region_end <= region[1]:
                return None
            elif (
                region[0] <= chunk_region_start <= region[1]
                and chunk_region_end > region[1]
            ):
                chunk_region_start = region[1] + 1
            elif (
                region[0] <= chunk_region_end <= region[1]
                and chunk_region_start < region[0]
            ):
                chunk_region_end = region[0] - 1
        return chunk_region_start, chunk_region_end

    def _is_not_compute(self, trace, chunk_range, dim_idx):
        """True if the given dim of `trace` is never computed inside chunk_range."""
        if trace["idx"][dim_idx] not in trace["compute"]:
            return True
        # the redundant `in trace["compute"]` re-check of the original is
        # dropped: the previous branch already returned when it was absent
        if all(
            i < chunk_range[0] or i > chunk_range[1]
            for i in trace["compute"][trace["idx"][dim_idx]]
        ):
            return True
        return False

    def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx):
        """Enumerate chunkable dims between node start_idx and node end_idx.

        Returns:
            list: chunk_info dicts produced by the index tracer's flow search;
            empty when no legal (start_dim, end_dim) pair exists.
        """
        start_traces = input_trace[start_idx]
        # only single-input starts are supported. Fixed: this check is
        # loop-invariant; the original re-tested it once per end_dim.
        if len(start_traces) > 1:
            return []
        end_trace = output_trace[end_idx]
        end_node = self.index_tracer.node_list[end_idx]
        chunk_infos = []
        for end_dim, _ in enumerate(end_trace["idx"]):
            for start_node, start_trace in start_traces.items():
                for start_dim, _ in enumerate(start_trace["idx"]):
                    # chunking a size-1 dim is pointless
                    if (
                        get_node_shape(end_node)[end_dim] == 1
                        or get_node_shape(start_node)[start_dim] == 1
                    ):
                        continue
                    # the end dim must originate from the start dim
                    if not self.index_tracer.check_index_source(
                        start_dim, start_node, start_idx, end_dim, end_node
                    ):
                        continue
                    # the end dim must not be computed inside the region
                    if not self.index_tracer.check_index_compute(
                        start_idx, end_dim, end_node, end_idx
                    ):
                        continue
                    # flow search builds the full chunk description
                    chunk_info = self.index_tracer.flow_search(
                        start_idx, start_dim, end_idx, end_dim
                    )
                    if chunk_info is None:
                        continue
                    # reject chunks that would duplicate an index
                    if not self.index_tracer.check_index_duplicate(chunk_info):
                        continue
                    chunk_infos.append(chunk_info)
        return chunk_infos

    def _search_possible_chunk_regions(self, max_chunk_region, peak_node):
        """Collect every chunk candidate with start <= peak_node <= end inside
        max_chunk_region."""
        possible_chunk_region = []
        output_trace = copy.deepcopy(self.index_tracer.idx_trace_list)
        input_trace = []  # per node: {input Node -> its index trace}
        for _, n in enumerate(self.index_tracer.node_list):
            cur_trace = {}
            for arg in n.args:
                if type(arg) == type(n) and not is_non_compute_node_except_placeholder(
                    arg
                ):
                    cur_trace[arg] = self.index_tracer._find_trace_from_node(arg)
            input_trace.append(cur_trace)

        for start_idx in range(max_chunk_region[0], peak_node + 1):
            for end_idx in range(peak_node, max_chunk_region[1] + 1):
                # skip non-compute nodes at either boundary
                if is_non_compute_node(
                    self.index_tracer.node_list[start_idx]
                ) or is_non_compute_node(self.index_tracer.node_list[end_idx]):
                    continue
                chunk_info = self._find_free_dim(
                    input_trace, output_trace, start_idx, end_idx
                )
                if len(chunk_info) > 0:
                    possible_chunk_region.extend(chunk_info)
        return possible_chunk_region

    def _step_search(self, mem_peak, active_node, chunk_regions):
        """One search iteration: find the peak, bound the region, select the
        best candidate. Returns a chunk_info dict or None when done."""
        peak_node = self._find_peak_node(mem_peak)
        max_chunk_region = self._search_max_chunk_region(
            active_node, peak_node, chunk_regions
        )
        # fixed: was `== None`; identity comparison is the correct idiom
        if max_chunk_region is None:
            return None
        possible_chunk_regions = self._search_possible_chunk_regions(
            max_chunk_region, peak_node
        )
        best_chunk_region = self.chunk_selector._select_best_chunk_region(
            possible_chunk_regions, chunk_regions, peak_node, max_chunk_region, mem_peak
        )
        best_chunk_region = self.index_tracer.reorder_all(best_chunk_region)
        return best_chunk_region

    def _stop_search(self, init_mem_peak, mem_peak):
        """Stop once the current peak drops below the median of the initial
        per-node peaks."""
        sorted_init_mem_peak = sorted(init_mem_peak)
        median = sorted_init_mem_peak[int(len(sorted_init_mem_peak) * 0.5)]
        return max(mem_peak) < median

    def search_region(self):
        """Run the full search.

        Returns:
            list: the chosen chunk_info dicts, in the order they were found.
        """
        chunk_infos = []
        (
            init_mem_peak,
            _,
            active_node,
        ) = self.memory_estimator.estimate_chunk_inference_mem(
            self.index_tracer.node_list
        )
        mem_peak = init_mem_peak

        while True:
            chunk_info = self._step_search(mem_peak, active_node, chunk_infos)
            if chunk_info is None:
                break
            chunk_infos.append(chunk_info)

            (
                mem_peak,
                _,
                active_node,
            ) = self.memory_estimator.estimate_chunk_inference_mem(
                self.index_tracer.node_list, chunk_infos
            )
            if self._stop_search(init_mem_peak, mem_peak):
                break
        # final pass only for its printed memory report
        self.memory_estimator.estimate_chunk_inference_mem(
            self.index_tracer.node_list, chunk_infos, print_mem=True
        )
        return chunk_infos
class ChunkSelector(object):
    """Pick the best chunk region among candidates found by ChunkRegionSearch.

    Two strategies:
      * "min_memory" (default): choose the candidate with the smallest
        resulting peak memory; chunk size is fixed to 1.
      * "fit_memory" (when max_memory is given, in MB): choose the shortest
        candidate that fits the budget, then grow its chunk size as far as
        the budget allows.
    """

    def __init__(
        self,
        index_tracer: "IndexTracer",
        memory_estimator: "MemoryEstimator",
        max_memory=None,
    ):
        # annotations are quoted so the class can be defined even when the
        # annotated types are imported lazily
        self.index_tracer = index_tracer
        self.memory_estimator = memory_estimator
        # NOTE: "stratge" is a historical misspelling of "strategy"; the name
        # is kept because external code may read this attribute.
        if max_memory is not None:
            self.stratge = "fit_memory"
            self.max_memory = max_memory  # MB
        else:
            self.stratge = "min_memory"

    def _select_best_chunk_region(
        self, possible_chunk_regions, chunk_infos, peak_node, max_chunk_region, mem_peak
    ):
        """Dispatch to the configured selection strategy."""
        if self.stratge == "min_memory":
            return self._select_min_memory_chunk_region(
                possible_chunk_regions,
                chunk_infos,
                peak_node,
                max_chunk_region,
                mem_peak,
            )
        elif self.stratge == "fit_memory":
            return self._select_fit_memory_chunk_region(
                possible_chunk_regions,
                chunk_infos,
                peak_node,
                max_chunk_region,
                mem_peak,
            )
        # fixed: the original raised a bare RuntimeError() with no message
        raise RuntimeError("unknown chunk selection strategy: %s" % self.stratge)

    def _remove_illegal_regions(self, possible_chunk_regions, chunk_infos):
        """Drop candidates that overlap existing regions or are malformed.

        Factored out of the two strategy methods, which contained this code
        duplicated verbatim. Mutates possible_chunk_regions in place (same
        semantics as the original) and returns it.
        """
        illegal_regions = [
            r
            for r in possible_chunk_regions
            if not self._is_legal_region(r, chunk_infos)
        ]
        for r in illegal_regions:
            if r in possible_chunk_regions:
                possible_chunk_regions.remove(r)
        return possible_chunk_regions

    def _build_region_dict(self, region, chunk_infos, max_chunk_region):
        """Reorder the node list for one candidate and measure its peak memory
        inside max_chunk_region. Shared by both strategies (previously
        duplicated)."""
        cur_region = region.copy()
        cur_node_list, cur_region = self.index_tracer.tmp_reorder(
            self.index_tracer.node_list, cur_region
        )
        cur_chunk_infos = chunk_infos + [cur_region]
        cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem(
            cur_node_list, cur_chunk_infos
        )[0]
        cur_chunk_region_peak = cur_mem_peak[
            max_chunk_region[0] : max_chunk_region[1] + 1
        ]
        return {
            "chunk_info": region,
            "chunk_max_mem": max(cur_chunk_region_peak),
            "chunk_len": self._get_compute_node_num(
                region["region"][0], region["region"][1]
            ),
            "reorder_chunk_info": cur_region,
            "reorder_node_list": cur_node_list,
        }

    def _select_fit_memory_chunk_region(
        self, possible_chunk_regions, chunk_infos, peak_node, max_chunk_region, mem_peak
    ):
        """Pick the shortest legal candidate that fits self.max_memory, then
        maximise its chunk size.

        Returns:
            dict or None: None when memory already fits or no legal candidate
            exists.
        Raises:
            RuntimeError: when candidates exist but none fits the budget.
        """
        # nothing to do if the current peak already satisfies the budget
        if max(mem_peak) < self.max_memory:
            return None

        possible_chunk_regions = self._remove_illegal_regions(
            possible_chunk_regions, chunk_infos
        )
        if len(possible_chunk_regions) == 0:
            return None

        # keep only candidates whose regional peak fits the budget
        regions_dict = []
        for region in possible_chunk_regions:
            region_dict = self._build_region_dict(
                region, chunk_infos, max_chunk_region
            )
            if region_dict["chunk_max_mem"] < self.max_memory:
                regions_dict.append(region_dict)
        if len(regions_dict) == 0:
            raise RuntimeError("Search failed. Try a larger memory threshold.")

        # shortest region (fewest compute nodes) wins
        chunk_len = [i["chunk_len"] for i in regions_dict]
        best_region = regions_dict[chunk_len.index(min(chunk_len))]

        # then grow the chunk size as far as the budget allows
        best_region = self._get_fit_chunk_size(best_region, chunk_infos)
        return best_region

    def _get_fit_chunk_size(self, chunk_region_dict, chunk_infos):
        """Double the chunk size until memory overflows, then binary-search the
        largest size that still fits. Returns the candidate's chunk_info with
        "chunk_size" set."""
        chunk_size = 1
        reorder_chunk_info = chunk_region_dict["reorder_chunk_info"]
        reorder_chunk_info["chunk_size"] = chunk_size
        cur_chunk_max_mem = 0
        # exponential growth phase: find the first overflowing size
        while cur_chunk_max_mem < self.max_memory:
            chunk_size *= 2
            reorder_chunk_info["chunk_size"] = chunk_size
            cur_chunk_infos = chunk_infos + [reorder_chunk_info]
            cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem(
                chunk_region_dict["reorder_node_list"], cur_chunk_infos
            )[0]
            cur_chunk_max_mem = max(
                cur_mem_peak[
                    reorder_chunk_info["region"][0] : reorder_chunk_info["region"][1]
                    + 1
                ]
            )
        # refine between the last fitting and the first overflowing size
        chunk_info = chunk_region_dict["chunk_info"]
        chunk_info["chunk_size"] = self._chunk_size_binary_search(
            chunk_size // 2, chunk_size, chunk_region_dict, chunk_infos
        )
        return chunk_info

    def _chunk_size_binary_search(self, l, r, chunk_region_dict, chunk_infos):
        """Binary-search [l, r] for the largest chunk size under the budget.

        A coarser step (gap=4) is used for larger sizes to save memory
        evaluations. Returns the found size (l when the interval is empty).
        """
        gap = 4 if l >= 16 else 1
        chunk_info = chunk_region_dict["reorder_chunk_info"]
        while r >= l + gap:
            mid = int((l + r) / 2 + 0.5)
            chunk_info["chunk_size"] = mid
            cur_chunk_infos = chunk_infos + [chunk_info]
            cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem(
                chunk_region_dict["reorder_node_list"], cur_chunk_infos
            )[0]
            cur_chunk_max_mem = max(
                cur_mem_peak[chunk_info["region"][0] : chunk_info["region"][1] + 1]
            )
            if cur_chunk_max_mem >= self.max_memory:
                r = mid - gap
            else:
                l = mid + gap
        return l

    def _get_compute_node_num(self, start, end):
        """Count compute nodes in node_list[start..end] inclusive."""
        return sum(
            1
            for i in self.index_tracer.node_list[start : end + 1]
            if not is_non_compute_node(i)
        )

    def _select_min_memory_chunk_region(
        self, possible_chunk_regions, chunk_infos, peak_node, max_chunk_region, mem_peak
    ):
        """Pick the legal candidate with the smallest resulting peak memory;
        chunk size is fixed to 1. Returns None when no legal candidate exists."""
        possible_chunk_regions = self._remove_illegal_regions(
            possible_chunk_regions, chunk_infos
        )
        if len(possible_chunk_regions) == 0:
            return None

        regions_dict = [
            self._build_region_dict(region, chunk_infos, max_chunk_region)
            for region in possible_chunk_regions
        ]

        # select the candidate with the minimum regional peak
        chunk_max_mem = [i["chunk_max_mem"] for i in regions_dict]
        best_region = regions_dict[chunk_max_mem.index(min(chunk_max_mem))][
            "chunk_info"
        ]
        if best_region is not None:
            best_region["chunk_size"] = 1
        return best_region

    def _is_legal_region(self, cur_chunk_info, chunk_infos):
        """A candidate is legal iff it is new, non-empty, and disjoint from
        every existing chunk region."""
        (chunk_region_start, chunk_region_end) = cur_chunk_info["region"]
        if cur_chunk_info in chunk_infos:
            return False
        if chunk_region_end < chunk_region_start:
            return False
        for i in chunk_infos:
            region = i["region"]
            if not (
                (chunk_region_start > region[1] and chunk_region_end > region[1])
                or (chunk_region_start < region[0] and chunk_region_end < region[0])
            ):
                return False
        return True
+ + Returns: + idx_count: int + """ + self.idx_count += 1 + return self.idx_count + + def _del_dim(self, idx, dim_idx): + self.idx_trace_list[idx]["idx"].pop(dim_idx) + self.idx_trace_list[idx]["compute"].pop(dim_idx) + self.idx_trace_list[idx]["source"].pop(dim_idx) + + def _add_dim(self, node_idx, dim_idx): + self.idx_trace_list[node_idx]["idx"].insert(dim_idx, self._add_index()) + self.idx_trace_list[node_idx]["compute"].insert(dim_idx, []) + self.idx_trace_list[node_idx]["source"].insert(dim_idx, {}) + + def _transform_index(self, node, node_dim): + node_idx = self._find_idx_trace_from_node(node) + dims = list(range(len(node_idx))) + return dims[node_dim] + + def _inherit_index(self, node_from, node_from_dim, node_to, node_to_dim): + node_from_dim = self._transform_index(node_from, node_from_dim) + node_to_dim = self._transform_index(node_to, node_to_dim) + node_from_trace = self._find_trace_from_node(node_from) + node_to_trace = self._find_trace_from_node(node_to) + node_to_trace["idx"][node_to_dim] = node_from_trace["idx"][node_from_dim] + node_to_trace["compute"][node_to_dim] = copy.deepcopy( + node_from_trace["compute"][node_from_dim] + ) + self._add_source(node_from, node_from_dim, node_to, node_to_dim, init=True) + + def _inherit_all_computation(self, node_from, node_to): + node_from_compute = self._find_compute_trace_from_node(node_from) + node_to_compute = self._find_compute_trace_from_node(node_to) + assert len(node_from_compute) == len(node_to_compute) + for i in range(len(node_from_compute)): + self._add_source(node_from, i, node_to, i) + node_to_compute[i] = copy.deepcopy(node_from_compute[i]) + + def _add_source(self, node_from, node_from_dim, node_to, node_to_dim, init=False): + node_from_dim = self._transform_index(node_from, node_from_dim) + node_from_trace_source = self._find_source_trace_from_node(node_from) + node_to_dim = self._transform_index(node_to, node_to_dim) + node_to_trace_source = self._find_source_trace_from_node(node_to) + 
node_from_idx = find_idx_by_name(node_from.name, self.node_list) + if init: + node_to_trace_source[node_to_dim] = {} + # add dim to cur new source + if node_from_idx not in node_to_trace_source[node_to_dim]: + node_to_trace_source[node_to_dim][node_from_idx] = [node_from_dim] + else: + if node_from_dim not in node_to_trace_source[node_to_dim][node_from_idx]: + node_to_trace_source[node_to_dim][node_from_idx].append(node_from_dim) + # update inputs source + for node_idx, node_dim in node_from_trace_source[node_from_dim].items(): + if node_idx not in node_to_trace_source[node_to_dim]: + node_to_trace_source[node_to_dim][node_idx] = copy.deepcopy(node_dim) + else: + for d in node_dim: + if d not in node_to_trace_source[node_to_dim][node_idx]: + node_to_trace_source[node_to_dim][node_idx].append(d) + + def _mark_computation_from_node(self, node_from, node_to, exclude=None): + if exclude == None: + exclude = [] + else: + exclude = [self._transform_index(node_to, i) for i in exclude] + node_from_compute = self._find_compute_trace_from_node(node_from) + node_to_compute = self._find_compute_trace_from_node(node_to) + # assert len(node_from_compute) == len(node_to_compute) + for i in range(-1, -min(len(node_from_compute), len(node_to_compute)) - 1, -1): + if self._transform_index(node_to, i) in exclude: + continue + self._add_source(node_from, i, node_to, i) + for j in node_from_compute[i]: + if j not in node_to_compute[i]: + node_to_compute[i].append(j) + + def _mark_idx_equal(self, node1, dim1, node2, dim2): + """ + Mark 2 index to be equal. + + Args: + idx1 (int): index count. + idx2 (int): index count. + """ + # node1_idx = _find_idx_by_name(node1.name, self.nodes_list) + # node2_idx = _find_idx_by_name(node2.name, self.nodes_list) + # if node1_idx > node2_idx: + # self._add_source(node2, dim2, node1, dim1) + # else: + # self._add_source(node1, dim1, node2, dim2) + + def _mark_computation(self, node, idx, dim): + """ + Mark some dims of node as computed. 
+ + Args: + node (node) + idx (int): node index + dim (list or int): dims to be marked as computed + """ + if isinstance(dim, int): + dim = [dim] + dims = list(range(len(get_node_shape(node)))) + for d in dim: + cur_dim = dims[d] + if idx not in self.idx_trace_list[idx]["compute"][cur_dim]: + self.idx_trace_list[idx]["compute"][cur_dim].append(idx) + + def _find_trace_from_node(self, node): + """ + Find node idx and compute trace by the node. + + Args: + node (node) + Returns: + idx (list): idx of the node + compute (list): computed idx of the node. + """ + node_idx = find_idx_by_name(node.name, self.node_list) + node_dict = self.idx_trace_list[node_idx] + return node_dict + + def _find_source_trace_from_node(self, node): + """ + Find node source trace by the node. + + Args: + node (node) + Returns: + idx (list): idx of the node + compute (list): computed idx of the node. + """ + node_idx = find_idx_by_name(node.name, self.node_list) + node_dict = self.idx_trace_list[node_idx] + return node_dict["source"] + + def _find_idx_trace_from_node(self, node): + """ + Find node idx trace by the node. + + Args: + node (node) + Returns: + idx (list): idx of the node + """ + node_idx = find_idx_by_name(node.name, self.node_list) + return self.idx_trace_list[node_idx]["idx"] + + def _find_compute_trace_from_node(self, node): + """ + Find node compute trace by the node. + + Args: + node (node) + Returns: + compute (list): computed idx of the node. + """ + node_idx = find_idx_by_name(node.name, self.node_list) + return self.idx_trace_list[node_idx]["compute"] + + def _assign_index_as_input(self, node, node_idx, input_node=None): + """ + Assign node's trace as its input node. 
+ + Args: + node (node) + node_idx (int) + """ + if input_node == None: + input_node = node.args[0] + input_node_idx = find_idx_by_name(input_node.name, self.node_list) + input_node_idx_trace = self.idx_trace_list[input_node_idx]["idx"] + + new_idx_trace = copy.deepcopy(input_node_idx_trace) + self.idx_trace_list[node_idx]["idx"] = new_idx_trace + + self._inherit_all_computation(input_node, node) + + def _assign_all_index(self, node, node_idx): + """ + Add new index for all node's dims. + + Args: + node (node) + node_idx (int) + """ + shape = node.meta["tensor_meta"].shape + new_trace = [] + for _ in shape: + new_trace.append(self._add_index()) + self.idx_trace_list[node_idx]["idx"] = new_trace + + def _assign_transpose_index(self, node, node_idx): + """ + Assign index for transpose op. + 1. swap input's dim according to transpose args + 2. inherit input's computation + + Args: + node (node) + node_idx (int) + """ + input_node = node.args[0] + tranpose_dim = node.args[1:] + + self._assign_index_as_input(node, node_idx, input_node) + self._inherit_index(input_node, tranpose_dim[1], node, tranpose_dim[0]) + self._inherit_index(input_node, tranpose_dim[0], node, tranpose_dim[1]) + + def _assign_permute_index(self, node, node_idx): + """ + Assign index for permute op. + 1. swap input's dim according to permute args + 2. inherit input's computation + + Args: + node (node) + node_idx (int) + """ + permute_dim = node.args[1:] + input_node = node.args[0] + + self._assign_index_as_input(node, node_idx, input_node) + for idx, d in enumerate(permute_dim): + self._inherit_index(input_node, d, node, idx) + + def _assign_linear_index(self, node, node_idx): + """ + Assign index for linear op. + 1. copy trace from input node and change last index accroding to weight + 2. mark equal for input node last index, weight first dim and bias dim. + 3. inherit input's computation, mark computation for last dim. 
+ + Args: + node (node) + node_idx (int) + """ + if len(node.args) == 2: + input_node, weight = node.args + bias = None + else: + input_node, weight, bias = node.args + + self._assign_index_as_input(node, node_idx) + self._inherit_index(weight, 1, node, -1) + + self._mark_computation(node, node_idx, [-1]) + self._mark_idx_equal(input_node, -1, weight, 0) + + if bias: + self._mark_idx_equal(input_node, -1, bias, 0) + + def _assign_matmul_index(self, node, node_idx): + """ + Assign index for matmul op. + 1. copy trace from matmul_left and change last index accroding to matmul_right. (assert they have same length) + 2. mark equal for input matmul_left -1 index and matmul_right -2 dim. + 3. inherit matmul_left and matmul_right computation, mark computation for last dim. + + Args: + node (node) + node_idx (int) + """ + matmul_left, matmul_right = node.args + + assert len(get_node_shape(matmul_left)) == len(get_node_shape(matmul_right)) + self._assign_index_as_input(node, node_idx, matmul_left) + self._inherit_index(matmul_right, -1, node, -1) + + self._mark_computation_from_node(matmul_right, node, [-1, -2]) + self._mark_computation(node, node_idx, [-1]) + self._mark_idx_equal(matmul_left, -1, matmul_right, -2) + + def _assign_layernorm_index(self, node, idx): + """ + Assign index for layernorm op. + 1. assign index as input node + 2. inherit computation and mark last 2 dims as computed. + + Args: + node (node) + node_idx (int) + """ + self._assign_index_as_input(node, idx) + self._mark_computation(node, idx, [-1]) + + def _assign_elementwise_index(self, node, idx): + """ + Assign index for element-wise op (eg. relu sigmoid add mul). + 1. assign index as input node + 2. inherit computation from all input nodes. 
+ + Args: + node (node) + node_idx (int) + """ + self._assign_index_as_input(node, idx) + nodes_in = [] + for node_in in node.args: + if type(node_in) == type(node): + nodes_in.append(node_in) + self._mark_computation_from_node(node_in, node) + assert len(nodes_in) <= 2 + if len(nodes_in) == 2: + node_in0_shape = get_node_shape(nodes_in[0]) + node_in1_shape = get_node_shape(nodes_in[1]) + for i in range(-1, -min(len(node_in0_shape), len(node_in1_shape)) - 1, -1): + if node_in0_shape[i] == node_in1_shape[i]: + self._mark_idx_equal(nodes_in[0], i, nodes_in[1], i) + + def _assgin_no_change_index(self, node, idx): + self._assign_index_as_input(node, idx) + for node_in in node.args: + if type(node_in) == type(node): + self._mark_computation_from_node(node_in, node) + + def _assign_einsum_index(self, node, idx): + """ + Assign index for einsum op. + + Args: + node (node) + node_idx (int) + """ + patterns = node.args[0] + input_nodes = node.args[1:] + + patterns = patterns.replace(" ", "") + left, right = patterns.split("->") + left = left.split(",") + + all_index = [] + for i in left: + for c in i: + all_index.append(c) + all_index = set(all_index) + free_index = set([i for i in right]) + sum_index = all_index - free_index + + for right_idx, right_indice in enumerate(right): + for left_idx, left_str in enumerate(left): + if right_indice in left_str: + source_idx = left_str.index(right_indice) + self._inherit_index( + input_nodes[left_idx], source_idx, node, right_idx + ) + + # for i in sum_index: + # for left_idx, left_str in enumerate(left): + # if i in left_str: + # self._mark_computation(node, idx, left_str.index(i)) + # break + + def _assign_softmax_index(self, node, idx): + """ + Assign index for softmax op. + 1. assign index as input node + 2. inherit computation and mark softmax dim as computed. 
+ + Args: + node (node) + node_idx (int) + """ + self._assign_index_as_input(node, idx) + self._mark_computation(node, idx, [node.kwargs["dim"]]) + + def _assign_unsqueeze_index(self, node, node_idx): + """ + Assign index for unsqueeze op. + 1. assign new index for unsqueeze dim + + Args: + node (node) + node_idx (int) + """ + self._del_dim(node_idx, -1) + self._assign_index_as_input(node, node_idx) + self._add_dim(node_idx, node.args[1]) + + def _assign_dropout_index(self, node, node_idx): + """ + Assign index for unsqueeze op. + 1. assign new index for unsqueeze dim + + Args: + node (node) + node_idx (int) + """ + self._assign_index_as_input(node, node_idx) + + def _assign_ones_like_index(self, node, node_idx): + """ + Assign index for oneslike op. + 1. assign new index for all dim + + Args: + node (node) + node_idx (int) + """ + self._assign_all_index(node, node_idx) + + def _assign_view_reshape_index(self, node, node_idx): + """ + Assign index for view and reshape op. + 1. get origin shape and target shape by meta info. + 2. compute the real value of -1 in target shape. + 3. determine changed dim, and assgin index for generated dim. + 4. log changed dim and generated dim for restore + 5. inherit computation. + 6. TODO: look into view list to see whether the view is associated with other, + if so assgin equal dim according to previous view. 
+ + Args: + node (node) + node_idx (int) + """ + # get data, turn into number + origin_node = node.args[0] + origin_shape = origin_node.meta["tensor_meta"].shape + target_shape = [] + for i in range(1, len(node.args)): + if isinstance(node.args[i], int): + target_shape.append(node.args[i]) + else: + target_shape.append(node.args[i].meta["fwd_out"][0]) + + # compute the value of -1 + if -1 in target_shape: + origin_product = 1 + for i in origin_shape: + origin_product *= i + target_product = -1 + for i in target_shape: + target_product *= i + shape_idx = target_shape.index(-1) + target_shape[shape_idx] = origin_product // target_product + + # determine changed dim + len_diff = len(origin_shape) - len(target_shape) + if len_diff == 1: + # dim merge + dim_equal = [i == j for i, j in zip(origin_shape[:-1], target_shape)] + dim_to = [dim_equal.index(False)] + dim_from = [dim_equal.index(False), dim_equal.index(False) + 1] + self._add_dim(node_idx, -1) + elif len_diff == -1: + # dim expand + dim_equal = [i == j for i, j in zip(origin_shape, target_shape[:-1])] + dim_from = [dim_equal.index(False)] + dim_to = [dim_equal.index(False), dim_equal.index(False) + 1] + self._del_dim(node_idx, -1) + else: + raise NotImplementedError( + "shape" + + str(origin_shape) + + "and" + + str(target_shape) + + "view not implemented" + ) + + # get new index + origin_trace = self._find_idx_trace_from_node(origin_node) + self._assign_index_as_input(node, node_idx, origin_node) + dim_from.reverse() + for i in dim_from: + self._del_dim(node_idx, i) + for i in dim_to: + self._add_dim(node_idx, i) + + # inherit computation + compute_log = self._find_compute_trace_from_node(origin_node) + for i in dim_from: + if origin_trace[i] in compute_log: + for j in dim_to: + self._mark_computation(node, node_idx, [j]) + break + + # log view, not used now + view_dict = { + "idx_from": [origin_trace[i] for i in dim_from], + "dim_from": dim_from, + "idx_to": [self.idx_trace_list[node_idx]["idx"][i] for i in 
dim_to], + "dim_to": dim_to, + } + self.idx_view_list[node] = view_dict + + def _merge_equal_idx(self): + idx_equal = copy.deepcopy(self.idx_trace_equal) + idx_equal.reverse() + for idx in idx_equal: + merge_to = min(idx) + merge_from = max(idx) + for trace in self.idx_trace_list: + if merge_from in trace["idx"]: + trace["idx"] = [ + merge_to if i == merge_from else i for i in trace["idx"] + ] + + def trace_index(self): + for idx, node in enumerate(self.node_list): + if node.op == "placeholder": + self._assign_all_index(node, idx) + elif node.op == "call_method": + if "transpose" in node.name: + self._assign_transpose_index(node, idx) + elif "permute" in node.name: + self._assign_permute_index(node, idx) + elif "view" in node.name or "reshape" in node.name: + self._assign_view_reshape_index(node, idx) + elif "unsqueeze" in node.name: + self._assign_unsqueeze_index(node, idx) + elif any(i in node.name for i in ["to", "contiguous"]): + self._assgin_no_change_index(node, idx) + else: + raise NotImplementedError(node.name, "method not implemented yet!") + elif node.op == "call_function": + if "linear" in node.name: + self._assign_linear_index(node, idx) + elif "matmul" in node.name: + self._assign_matmul_index(node, idx) + elif "softmax" in node.name: + self._assign_softmax_index(node, idx) + elif any(n in node.name for n in ["mul", "add", "sigmoid", "relu"]): + self._assign_elementwise_index(node, idx) + elif "ones_like" in node.name: + self._assign_ones_like_index(node, idx) + elif "dropout" in node.name: + self._assign_dropout_index(node, idx) + elif "einsum" in node.name: + self._assign_einsum_index(node, idx) + elif "getattr" in node.name: + continue # get attr like shape + elif "getitem" in node.name: + continue # get item in list + else: + raise NotImplementedError( + node.name, "function not implemented yet!" 
+ ) + elif node.op == "call_module": + if any(n in node.name for n in ["layernorm", "norm"]): + self._assign_layernorm_index(node, idx) + else: + raise NotImplementedError(node.name, "module not implemented yet!") + elif node.op == "get_attr": + self._assign_all_index(node, idx) # get param + elif node.op == "output": + continue + else: + raise NotImplementedError(node.op, "op not implemented yet!") + # self._merge_equal_idx() + + def check_index_source(self, start_dim, start_node, start_idx, end_dim, end_node): + """ + Check 2 given index: one index should be source of the other + Args: + start_idx(int): start node chunk dim + start_node(node): start node + end_idx(int): end node chunk dim + end_node(node): end node + + Returns: + bool: True if check pass + """ + start_node_idx = find_idx_by_name(start_node.name, self.node_list) + end_node_trace = self._find_trace_from_node(end_node) + end_node_trace_source = end_node_trace["source"][end_dim] + sorted_source = sorted( + end_node_trace_source.items(), key=lambda d: d[0], reverse=True + ) + for node_idx, node_dim in sorted_source: + if node_idx == start_node_idx and start_dim in node_dim: + return True + # it means we meet a node outside the loop, and the node is not input node + if node_idx < start_idx: + return False + return False + + def check_index_compute(self, start_idx, end_dim, end_node, end_idx): + """ + Check 2 given index: check they haven't been computed in the source trace. 
+ Args: + start_idx(int): start node chunk dim + start_node(node): start node + end_idx(int): end node chunk dim + end_node(node): end node + + Returns: + bool: True if check pass + """ + end_node_trace = self._find_trace_from_node(end_node) + end_node_compute = end_node_trace["compute"][end_dim] + if any(start_idx <= i <= end_idx for i in end_node_compute): + return False + return True + + def get_node_chunk_dim(self, node_from, node_from_dim, node_to): + node_from_source = self._find_source_trace_from_node(node_from) + dim_source = node_from_source[node_from_dim] + node_to_idx = find_idx_by_name(node_to.name, self.node_list) + for k, v in dim_source.items(): + if k == node_to_idx: + return v + return None + + def _find_inherit_dim(self, input_node, input_dim, node): + input_node_idx = find_idx_by_name(input_node.name, self.node_list) + node_trace_source = self._find_source_trace_from_node(node) + for node_dim in range(len(get_node_shape(node))): + if ( + input_node_idx in node_trace_source[node_dim] + and input_dim[0] in node_trace_source[node_dim][input_node_idx] + ): + return node_dim + return None + + def check_index_duplicate(self, chunk_infos, return_dim=False): + input_dim_after_node = {} + for input_node_idx, input_node in enumerate(chunk_infos["inputs"]): + for k, v in chunk_infos["inputs_dim"][input_node_idx].items(): + inherit_dim = self._find_inherit_dim(input_node, v, self.node_list[k]) + if inherit_dim: + input_dim_after_node[k] = inherit_dim + + for node in self.node_list[ + chunk_infos["region"][0] : chunk_infos["region"][1] + 1 + ]: + if is_non_compute_node_except_placeholder(node): + continue + count = 0 + duplicate_dims = [] + node_trace_source = self._find_source_trace_from_node(node) + for node_dim in range(len(get_node_shape(node))): + duplicate_dim = [] + duplicate_flag = False + dim_source = node_trace_source[node_dim] + for k, v in dim_source.items(): + if chunk_infos["region"][0] <= k <= chunk_infos["region"][1]: + if k in 
input_dim_after_node and input_dim_after_node[k] in v: + duplicate_flag = True + duplicate_dim.append((k, v)) + duplicate_dims.append(duplicate_dim) + if duplicate_flag: + count += 1 + + if count > 1: + if return_dim: + return False, duplicate_dims + else: + return False + if return_dim: + return True, None + else: + return True + + def _assgin_single_node_flow( + self, + arg_node, + start_idx, + end_idx, + cur_node_dim, + cur_node_compute, + cur_node_source, + cur_node_fix_dim, + all_node_info, + next_node_list, + ): + arg_idx = find_idx_by_name(arg_node.name, self.node_list) + # arg in chunk range or be inputs + if not (start_idx <= arg_idx < end_idx): + return True + + # find arg dim + if cur_node_dim is not None: + # dim is computed + if arg_idx in cur_node_compute[cur_node_dim]: + return False + if arg_idx not in cur_node_source[cur_node_dim]: + arg_dim = None + else: + arg_dim = cur_node_source[cur_node_dim][arg_idx][0] + else: + arg_dim = None + + # get fix dim + arg_fix_dim = [] + if cur_node_dim is not None: + for i in cur_node_fix_dim: + fix_dim_source = cur_node_source[i] + if arg_idx in fix_dim_source: + arg_fix_dim.append(fix_dim_source[arg_idx][0]) + + # if already in node_info, arg dim must be same + if arg_node in all_node_info: + if all_node_info[arg_node]["chunk_dim"] != arg_dim: + return False + all_node_info[arg_node]["fix_dim"] = list( + set(all_node_info[arg_node]["fix_dim"] + arg_fix_dim) + ) + # else add it to list + else: + all_node_info[arg_node] = {"chunk_dim": arg_dim, "fix_dim": arg_fix_dim} + + next_node_list.append(arg_node) + return True + + def flow_search(self, start_idx, start_dim, end_idx, end_dim): + inputs, outputs = find_chunk_compute_input_and_output_nodes( + self.node_list[start_idx : end_idx + 1] + ) + # only single ouput + if len(outputs) > 1: + return None + + cur_node_list = [self.node_list[end_idx]] # start from the last node + all_node_info = {cur_node_list[0]: {"chunk_dim": end_dim, "fix_dim": []}} + + while 
len(cur_node_list) > 0: + next_node_list = [] + + for cur_node in cur_node_list: + # get cur node info + cur_node_chunk_dim = all_node_info[cur_node]["chunk_dim"] + cur_node_fix_dim = all_node_info[cur_node]["fix_dim"] + cur_node_idx = find_idx_by_name(cur_node.name, self.node_list) + if cur_node_chunk_dim: + cur_node_compute = self._find_compute_trace_from_node(cur_node) + cur_node_source = self._find_source_trace_from_node(cur_node) + else: + cur_node_compute = cur_node_source = None + + # get all valid args + arg_list = [] + for arg in cur_node.args: + if type(arg) != type(cur_node): + continue + if is_non_compute_node(arg): + continue + arg_list.append(arg) + flow_flag = self._assgin_single_node_flow( + arg, + start_idx, + end_idx, + cur_node_chunk_dim, + cur_node_compute, + cur_node_source, + cur_node_fix_dim, + all_node_info, + next_node_list, + ) + if flow_flag == False: + return None + + if len(arg_list) == 2: + if any(i in cur_node.name for i in ["add", "mul"]): + for arg in arg_list: + if not ( + start_idx + <= find_idx_by_name(arg.name, self.node_list) + < end_idx + ): + continue + arg_chunk_dim = all_node_info[arg]["chunk_dim"] + arg_fix_dim = all_node_info[arg]["fix_dim"] + arg_shape = get_node_shape(arg) + # add all dim as fix dim except chunk dim + for i, shape in enumerate(arg_shape): + if shape != 1 and i != cur_node_chunk_dim: + if i == arg_chunk_dim: + return None + if i not in arg_fix_dim: + arg_fix_dim.append(i) + elif "einsum" in cur_node.name: + pass + elif "matmul" in cur_node.name: + pass + else: + raise NotImplementedError() + cur_node_list = next_node_list + + inputs_dim = [] + remove_inputs = [] + for input_node in inputs: + input_dict = {} + input_node_idx = find_idx_by_name(input_node.name, self.node_list) + for user in input_node.users.keys(): + if is_non_compute_node(user): + continue + user_idx = find_idx_by_name(user.name, self.node_list) + if start_idx <= user_idx <= end_idx: + chunk_dim = all_node_info[user]["chunk_dim"] + if 
chunk_dim is not None: + user_source = self._find_source_trace_from_node(user)[chunk_dim] + if input_node_idx in user_source: + input_dict[user_idx] = user_source[input_node_idx] + else: + return None + if len(input_dict) == 0: + remove_inputs.append(input_node) + else: + inputs_dim.append(input_dict) + for i in remove_inputs: + if i in inputs: + inputs.remove(i) + + chunk_info = { + "region": (start_idx, end_idx), + "inputs": inputs, + "inputs_non_chunk": [], + "inputs_dim": inputs_dim, + "outputs": outputs, + "outputs_dim": end_dim, + "node_chunk_dim": all_node_info, + "args": {}, + } + + # move useless nodes ahead of loop + # get all possible prepose nodes + maybe_prepose_nodes = [] + for node, node_info in all_node_info.items(): + if node_info["chunk_dim"] is None: + maybe_prepose_nodes.append(node) + maybe_prepose_nodes.sort( + key=lambda x: find_idx_by_name(x.name, self.node_list), + reverse=True, + ) # from last node to first node + prepose_nodes = [] + # set every node as root, search its args, if all legal, turn root and args as prepose nodes + while len(maybe_prepose_nodes) > 0: + tmp_cur_prepose_nodes = [maybe_prepose_nodes[0]] + tmp_cur_related_prepose_nodes = [] + prepose_flag = True + + # loop cur node's all arg until out of chunk + while len(tmp_cur_prepose_nodes) > 0: + if prepose_flag == False: + break + tmp_next_prepose_nodes = [] + tmp_cur_related_prepose_nodes.extend(tmp_cur_prepose_nodes) + for cur_prepose_node in tmp_cur_prepose_nodes: + if prepose_flag == False: + break + for cur_prepose_node_arg in cur_prepose_node.args: + if type(cur_prepose_node_arg) != type(cur_prepose_node): + continue + # out of loop + if not ( + start_idx + <= find_idx_by_name( + cur_prepose_node_arg.name, self.node_list + ) + < end_idx + ): + continue + # compute op in loop + elif cur_prepose_node_arg in all_node_info: + if all_node_info[cur_prepose_node_arg]["chunk_dim"] is None: + tmp_next_prepose_nodes.append(cur_prepose_node_arg) + else: + prepose_flag = False + 
break + # non compute op + else: + tmp_next_prepose_nodes.append(cur_prepose_node_arg) + tmp_cur_prepose_nodes = tmp_next_prepose_nodes + + if prepose_flag == False: + maybe_prepose_nodes.remove(maybe_prepose_nodes[0]) + continue + else: + for n in tmp_cur_related_prepose_nodes: + if n not in prepose_nodes: + prepose_nodes.append(n) + if n in maybe_prepose_nodes: + maybe_prepose_nodes.remove(n) + # sort by index + prepose_nodes.sort(key=lambda x: find_idx_by_name(x.name, self.node_list)) + chunk_info["args"]["prepose_nodes"] = prepose_nodes + + # we need to log input nodes to avoid deleteing them in the loop + chunk_node_list = self.node_list[start_idx : end_idx + 1] + # also need to get some prepose node's arg out of non_chunk_inputs + for n in prepose_nodes: + chunk_node_list.remove(n) + non_chunk_inputs = find_chunk_all_input_nodes(chunk_node_list) + for i in non_chunk_inputs: + if i not in chunk_info["inputs"]: + chunk_info["inputs_non_chunk"].append(i) + + # reassgin reshape size, some size may have changed due to chunk + chunk_info = self._reassgin_reshape_size(chunk_info) + + return chunk_info + + def _reassgin_reshape_size(self, chunk_info): + chunk_region = chunk_info["region"] + reshape_size = {} + chunk_shape = get_node_shape(chunk_info["outputs"][0])[ + chunk_info["outputs_dim"] + ] + for node in self.node_list[chunk_region[0] : chunk_region[1] + 1]: + if any(i in node.name for i in ["reshape", "view"]): + reshape_args = node.args[1:] + reshape_log = self.idx_view_list[node] + chunk_dim = chunk_info["node_chunk_dim"][node]["chunk_dim"] + reshape_size[node.name] = {} + for reshape_arg_dim, reshape_arg in enumerate(reshape_args): + if reshape_arg_dim in reshape_log["dim_to"]: + continue + if reshape_arg_dim == chunk_dim: + reshape_size[node.name][reshape_arg.name] = ( + "min(chunk_size, %d - chunk_idx)" % chunk_shape + ) + chunk_info["reshape_size"] = reshape_size + return chunk_info + + def _get_reorder_map(self, chunk_info): + reorder_map = {i: i for i 
in range(len(self.node_list))} + + chunk_region_start = chunk_info["region"][0] + chunk_region_end = chunk_info["region"][1] + chunk_prepose_nodes = chunk_info["args"]["prepose_nodes"] + chunk_prepose_nodes_idx = [ + find_idx_by_name(i.name, self.node_list) for i in chunk_prepose_nodes + ] + # put prepose nodes ahead + for idx, n in enumerate(chunk_prepose_nodes): + n_idx = chunk_prepose_nodes_idx[idx] + reorder_map[n_idx] = chunk_region_start + idx + # put other nodes after prepose nodes + for n in self.node_list[chunk_region_start : chunk_region_end + 1]: + if n in chunk_prepose_nodes: + continue + n_idx = find_idx_by_name(n.name, self.node_list) + pos = sum([n_idx < i for i in chunk_prepose_nodes_idx]) + reorder_map[n_idx] = n_idx + pos + + return reorder_map + + def _reorder_chunk_info(self, chunk_info, reorder_map): + # update chunk info + chunk_info["region"] = ( + chunk_info["region"][0] + len(chunk_info["args"]["prepose_nodes"]), + chunk_info["region"][1], + ) + new_inputs_dim = [] + for idx, input_dim in enumerate(chunk_info["inputs_dim"]): + new_input_dim = {} + for k, v in input_dim.items(): + new_input_dim[reorder_map[k]] = v + new_inputs_dim.append(new_input_dim) + chunk_info["inputs_dim"] = new_inputs_dim + return chunk_info + + def _update_all_reorder_map(self, reorder_map): + for origin_idx, map_idx in self.all_reorder_map.items(): + self.all_reorder_map[origin_idx] = reorder_map[map_idx] + + def _reorder_self_node_list(self, reorder_map): + new_node_list = [None for _ in range(len(self.node_list))] + for old_idx, new_idx in reorder_map.items(): + new_node_list[new_idx] = self.node_list[old_idx] + self.node_list = new_node_list + + def _reorder_idx_trace(self, reorder_map): + # reorder list + new_idx_trace_list = [None for _ in range(len(self.idx_trace_list))] + for old_idx, new_idx in reorder_map.items(): + new_idx_trace_list[new_idx] = self.idx_trace_list[old_idx] + self.idx_trace_list = new_idx_trace_list + # update compute + for idx_trace in 
self.idx_trace_list: + compute = idx_trace["compute"] + for dim_compute in compute: + for idx, i in enumerate(dim_compute): + dim_compute[idx] = reorder_map[i] + # update source + for idx_trace in self.idx_trace_list: + source = idx_trace["source"] + for dim_idx, dim_source in enumerate(source): + new_dim_source = {} + for k, v in dim_source.items(): + new_dim_source[reorder_map[k]] = v + source[dim_idx] = new_dim_source + + def reorder_all(self, chunk_info): + if chunk_info is None: + return chunk_info + if len(chunk_info["args"]["prepose_nodes"]) == 0: + return chunk_info + reorder_map = self._get_reorder_map(chunk_info) + self._update_all_reorder_map(reorder_map) + self._reorder_idx_trace(reorder_map) + self._reorder_self_node_list(reorder_map) + chunk_info = self._reorder_chunk_info(chunk_info, reorder_map) + return chunk_info + + def reorder_node_list(self, node_list): + new_node_list = [None for _ in range(len(node_list))] + for old_idx, new_idx in self.all_reorder_map.items(): + new_node_list[new_idx] = node_list[old_idx] + return new_node_list + + def tmp_reorder(self, node_list, chunk_info): + if len(chunk_info["args"]["prepose_nodes"]) == 0: + return node_list, chunk_info + reorder_map = self._get_reorder_map(chunk_info) + + # new tmp node list + new_node_list = [None for _ in range(len(node_list))] + for old_idx, new_idx in reorder_map.items(): + new_node_list[new_idx] = node_list[old_idx] + + chunk_info = self._reorder_chunk_info(chunk_info, reorder_map) + return new_node_list, chunk_info diff --git a/colossalai/autochunk/memory_estiamtor.py b/colossalai/autochunk/memory_estiamtor.py new file mode 100644 index 000000000000..c3d8b1803ce9 --- /dev/null +++ b/colossalai/autochunk/memory_estiamtor.py @@ -0,0 +1,318 @@ +import copy +from typing import Any, Callable, Dict, Iterable, List, Tuple + +import torch +from torch.fx.node import Node, map_arg + +from colossalai.fx.profiler import activation_size, parameter_size + +from .index_tracer import IndexTracer 
+from .utils import ( + delete_free_var_from_last_use, + find_idx_by_name, + get_node_shape, + is_non_compute_node_except_placeholder, +) + + +class MemoryEstimator(object): + def __init__(self, index_tracer: IndexTracer) -> None: + pass + + def _get_meta_node_size(self, x): + x = x.meta["tensor_meta"] + x = x.numel * torch.tensor([], dtype=x.dtype).element_size() + return x + + def _get_output_node(self, n): + fwd_out = { + x.uuid: x + for x in n.meta["fwd_out"] + if isinstance(x, torch.Tensor) and hasattr(x, "uuid") + } + out_size = activation_size(fwd_out) + out_node = [n.name] if out_size > 0 else [] + # if any(i in n.name for i in ['transpose', 'permute', 'view']): + # out_size = 0 + return out_size, out_node + + def _get_output_node_size(self, n): + return self._get_output_node(n)[0] + + def _add_active_node(self, n, active_list): + new_active = self._get_output_node(n)[1] + if n.op == "placeholder": + new_active.append(n.name) + for i in new_active: + if i not in active_list: + active_list.append(i) + + def _get_delete_node(self, user, user_to_last_uses, to_keep=None): + delete_size = 0 + delete_node = [] + if user.op not in ("output",): + nodes_to_delete = user_to_last_uses.get(user, []) + if to_keep is not None: + keep_list = [] + for n in nodes_to_delete: + if n.name in to_keep: + keep_list.append(n) + for n in keep_list: + if n in nodes_to_delete: + nodes_to_delete.remove(n) + if len(nodes_to_delete): + out_node = [self._get_output_node(i) for i in nodes_to_delete] + delete_size = sum([i[0] for i in out_node]) + for i in range(len(out_node)): + if out_node[i][0] > 0: + delete_node.append(out_node[i][1][0]) + elif nodes_to_delete[i].op == "placeholder": + delete_node.append(nodes_to_delete[i].name) + # elif any(j in nodes_to_delete[i].name for j in ['transpose', 'permute', 'view']): + # delete_node.append(nodes_to_delete[i].name) + return delete_size, delete_node + + def _get_delete_node_size(self, user, user_to_last_uses, to_keep): + return 
self._get_delete_node(user, user_to_last_uses, to_keep)[0] + + def _remove_deactive_node(self, user, user_to_last_uses, active_list): + delete_node = self._get_delete_node(user, user_to_last_uses)[1] + for i in delete_node: + if i in active_list: + active_list.remove(i) + + def _get_chunk_inputs_size( + self, chunk_inputs, chunk_inputs_non_chunk, node_list, chunk_end_idx + ): + nodes_to_delete = [] + for chunk_input in chunk_inputs + chunk_inputs_non_chunk: + chunk_input_users = chunk_input.users.keys() + chunk_input_users_idx = [ + find_idx_by_name(i.name, node_list) for i in chunk_input_users + ] + if all(i <= chunk_end_idx for i in chunk_input_users_idx): + if chunk_input not in nodes_to_delete: + nodes_to_delete.append(chunk_input) + out_node = [self._get_output_node(i) for i in nodes_to_delete] + delete_size = sum([i[0] for i in out_node]) + return delete_size + + def _get_last_usr(self, nodes): + node_to_last_use: Dict[Node, Node] = {} + user_to_last_uses: Dict[Node, List[Node]] = {} + + def register_last_uses(n: Node, user: Node): + if n not in node_to_last_use: + node_to_last_use[n] = user + user_to_last_uses.setdefault(user, []).append(n) + + for node in reversed(nodes): + map_arg(node.args, lambda n: register_last_uses(n, node)) + map_arg(node.kwargs, lambda n: register_last_uses(n, node)) + return user_to_last_uses + + def _get_contiguous_memory(self, node, not_contiguous_list, delete=False): + mem = 0 + not_contiguous_ops = ["permute"] + inherit_contiguous_ops = ["transpose", "view"] + + if node.op == "call_function" and any( + n in node.name for n in ["matmul", "reshape"] + ): + for n in node.args: + if n in not_contiguous_list: + # matmul won't change origin tensor, but create a tmp copy + mem += self._get_output_node_size(n) + elif node.op == "call_module": + for n in node.args: + if n in not_contiguous_list: + # module will just make origin tensor to contiguous + if delete: + not_contiguous_list.remove(n) + elif node.op == "call_method" and any( + i 
in node.name for i in not_contiguous_ops + ): + if node not in not_contiguous_list: + not_contiguous_list.append(node) + return mem + + def _get_chunk_ratio(self, node, chunk_node_dim, chunk_size): + if node not in chunk_node_dim: + return 1.0 + node_shape = get_node_shape(node) + chunk_dim = chunk_node_dim[node]["chunk_dim"] + if chunk_dim is None: + return 1.0 + else: + return float(chunk_size) / node_shape[chunk_dim] + + def _get_chunk_delete_node_size( + self, user, user_to_last_uses, chunk_ratio, chunk_inputs_names + ): + # if any(j in user.name for j in ['transpose', 'permute', 'view']): + # return 0 + if user.op in ("placeholder", "output"): + return 0 + nodes_to_delete = user_to_last_uses.get(user, []) + delete_size = 0 + for n in nodes_to_delete: + if n.name in chunk_inputs_names: + continue + delete_size += self._get_output_node_size(n) * chunk_ratio + return delete_size + + def _print_mem_log(self, log, nodes, title=None): + if title: + print(title) + for idx, (l, n) in enumerate(zip(log, nodes)): + print("%s:%.2f \t" % (n.name, l), end="") + if (idx + 1) % 3 == 0: + print("") + print("\n") + + def _print_compute_op_mem_log(self, log, nodes, title=None): + if title: + print(title) + for idx, (l, n) in enumerate(zip(log, nodes)): + if n.op in ["placeholder", "get_attr", "output"]: + continue + if any(i in n.name for i in ["getitem", "getattr"]): + continue + print("%s:%.2f \t" % (n.name, l), end="") + if (idx + 1) % 3 == 0: + print("") + print("\n") + + def estimate_chunk_inference_mem( + self, + node_list, + chunk_infos=None, + print_mem=False, + ): + act_memory = 0.0 + act_memory_peak_log = [] + act_memory_after_node_log = [] + active_node_list = [] + active_node_list_log = [] + not_contiguous_list = [] + user_to_last_uses = self._get_last_usr(node_list) + user_to_last_uses_no_free_var = self._get_last_usr(node_list) + delete_free_var_from_last_use(user_to_last_uses_no_free_var) + + use_chunk = True if chunk_infos is not None else False + chunk_within = 
False + chunk_region_idx = None + chunk_ratio = 1 # use it to estimate chunk mem + chunk_inputs_names = [] + + if use_chunk: + chunk_regions = [i["region"] for i in chunk_infos] + chunk_starts = [i[0] for i in chunk_regions] + chunk_ends = [i[1] for i in chunk_regions] + chunk_inputs = [i["inputs"] for i in chunk_infos] + chunk_inputs_non_chunk = [i["inputs_non_chunk"] for i in chunk_infos] + chunk_inputs_names = [j.name for i in chunk_inputs for j in i] + [ + j.name for i in chunk_inputs_non_chunk for j in i + ] + chunk_outputs = [i["outputs"][0] for i in chunk_infos] + chunk_node_dim = [i["node_chunk_dim"] for i in chunk_infos] + chunk_sizes = [ + i["chunk_size"] if "chunk_size" in i else 1 for i in chunk_infos + ] + + for idx, node in enumerate(node_list): + # if node in chunk start nodes, change chunk ratio and add chunk_tensor + if use_chunk and idx in chunk_starts: + chunk_within = True + chunk_region_idx = chunk_starts.index(idx) + act_memory += self._get_output_node_size( + chunk_outputs[chunk_region_idx] + ) / (1024**2) + + # determine chunk ratio for current node + if chunk_within: + chunk_ratio = self._get_chunk_ratio( + node, + chunk_node_dim[chunk_region_idx], + chunk_sizes[chunk_region_idx], + ) + + # if node is placeholder, just add the size of the node + if node.op == "placeholder": + act_memory += self._get_meta_node_size(node) * chunk_ratio / (1024**2) + act_memory_peak_log.append(act_memory) + # skip output + elif node.op == "output": + continue + # no change for non compute node + elif is_non_compute_node_except_placeholder(node): + act_memory_peak_log.append(act_memory) + # node is a compute op + # calculate tmp, output node and delete node memory + else: + # forward memory + # TODO: contiguous_memory still not accurate for matmul, view, reshape and transpose + act_memory += ( + self._get_contiguous_memory(node, not_contiguous_list) + * chunk_ratio + / (1024**2) + ) + act_memory += ( + self._get_output_node_size(node) * chunk_ratio / (1024**2) 
+ ) + # record max act memory + act_memory_peak_log.append(act_memory) + # delete useless memory + act_memory -= ( + self._get_contiguous_memory(node, not_contiguous_list, delete=True) + * chunk_ratio + / (1024**2) + ) + # delete unused vars not in chunk_input_list + # we can't delete input nodes until chunk ends + if chunk_within: + act_memory -= self._get_chunk_delete_node_size( + node, + user_to_last_uses_no_free_var, + chunk_ratio, + chunk_inputs_names, + ) / (1024**2) + else: + act_memory -= self._get_delete_node_size( + node, user_to_last_uses_no_free_var, chunk_inputs_names + ) / (1024**2) + + # log active node, only effective without chunk + self._add_active_node(node, active_node_list) + self._remove_deactive_node(node, user_to_last_uses, active_node_list) + + # if node in chunk end nodes, restore chunk settings + if use_chunk and idx in chunk_ends: + act_memory -= ( + self._get_output_node_size(node) * chunk_ratio / (1024**2) + ) + act_memory -= self._get_chunk_inputs_size( + chunk_inputs[chunk_region_idx], + chunk_inputs_non_chunk[chunk_region_idx], + node_list, + chunk_regions[chunk_region_idx][1], + ) / (1024**2) + chunk_within = False + chunk_ratio = 1 + chunk_region_idx = None + + act_memory_after_node_log.append(act_memory) + active_node_list_log.append(copy.deepcopy(active_node_list)) + + if print_mem: + print("with chunk" if use_chunk else "without chunk") + # self._print_mem_log(act_memory_peak_log, node_list, "peak") + # self._print_mem_log(act_memory_after_node_log, node_list, "after") + self._print_compute_op_mem_log(act_memory_peak_log, node_list, "peak") + # self._print_compute_op_mem_log( + # act_memory_after_node_log, node_list, "after" + # ) + + # param_memory = parameter_size(gm) + # all_memory = act_memory + param_memory + return act_memory_peak_log, act_memory_after_node_log, active_node_list_log diff --git a/colossalai/autochunk/utils.py b/colossalai/autochunk/utils.py new file mode 100644 index 000000000000..b62a6600adc8 --- 
/dev/null +++ b/colossalai/autochunk/utils.py @@ -0,0 +1,95 @@ +from typing import Any, Callable, Dict, Iterable, List, Tuple + +from torch.fx.node import Node + + +def is_non_compute_node(node): + if any(i in node.op for i in ["placeholder", "get_attr", "output"]) or any( + i in node.name for i in ["getitem", "getattr"] + ): + return True + return False + + +def get_node_shape(node): + if hasattr(node.meta["tensor_meta"], "shape"): + return node.meta["tensor_meta"].shape + return None + + +def is_non_compute_node_except_placeholder(node): + if any(i in node.op for i in ["get_attr", "output"]) or any( + i in node.name for i in ["getitem", "getattr"] + ): + return True + return False + + +def is_non_compute_node_except_placeholder_output(node): + if any(i in node.op for i in ["get_attr"]) or any( + i in node.name for i in ["getitem", "getattr"] + ): + return True + return False + + +def find_idx_by_name(name, nodes_list): + for idx, node in enumerate(nodes_list): + if node.name == name: + return idx + raise RuntimeError("name %s not found in node list" % name) + + +def delete_free_var_from_last_use(user_to_last_uses): + for key, value in user_to_last_uses.items(): + for n in value: + if n.op == "placeholder": + user_to_last_uses[key].remove(n) + + +def find_chunk_all_input_nodes(nodes: List[Node]): + """ + Find non-compute input and output node names. + input nodes are nodes used in the list + output nodes are nodes will use nodes in the list + """ + input_nodes = [] + for node in nodes: + for input_node in node._input_nodes.keys(): + if input_node not in nodes and input_node not in input_nodes: + input_nodes.append(input_node) + return input_nodes + + +def find_chunk_compute_input_and_output_nodes(nodes: List[Node]): + """ + Find non-compute input and output node names. 
+ input nodes are nodes used in the list + output nodes are nodes will use nodes in the list + """ + input_nodes = [] + output_nodes = [] + + # if a node has an input node which is not in the node list + # we treat that input node as the input of the checkpoint function + for node in nodes: + for input_node in node._input_nodes.keys(): + if ( + input_node not in nodes + and input_node not in input_nodes + and not is_non_compute_node_except_placeholder(input_node) + ): + input_nodes.append(input_node) + + # if a node has a user node which is not in the node list + # we treat that user node as the node receiving the current node output + for node in nodes: + for output_node in node.users.keys(): + if ( + output_node not in nodes + and node not in output_nodes + and not is_non_compute_node_except_placeholder_output(output_node) + ): + output_nodes.append(node) + + return input_nodes, output_nodes diff --git a/tests/test_autochunk/benchmark_autochunk.py b/tests/test_autochunk/benchmark_autochunk.py index 8df6d9ff4564..702eb7026bb7 100644 --- a/tests/test_autochunk/benchmark_autochunk.py +++ b/tests/test_autochunk/benchmark_autochunk.py @@ -3,7 +3,7 @@ import torch import torch.fx -from colossalai.autochunk.chunk_codegen import ChunkCodeGen +from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen from colossalai.fx import ColoTracer from colossalai.fx.graph_module import ColoGraphModule from colossalai.fx.passes.meta_info_prop import MetaInfoProp @@ -49,25 +49,29 @@ def _build_autochunk(model, max_memory, node, pair): "pair": pair.to(torch.device("meta")), }, ) + gm_prop = torch.fx.symbolic_trace(model) # must use symbolic_trace interp = MetaInfoProp(gm_prop) interp.propagate( MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, fake_device="cuda:0") ) + # now run it twice to get meta info in graph module, not necessary gm = torch.fx.GraphModule(model, graph) interp = MetaInfoProp(gm) interp.propagate( MetaTensor(node, fake_device="cuda:0"), 
MetaTensor(pair, fake_device="cuda:0") ) + # set code_gen - codegen = ChunkCodeGen(gm_prop, max_memory) + codegen = AutoChunkCodeGen(gm_prop, max_memory) graph.set_codegen(codegen) gm = ColoGraphModule(model, graph) gm.recompile() + # print - code = graph.python_code("self").src - print(code) + # code = graph.python_code("self").src + # print(code) return gm diff --git a/tests/test_autochunk/test_autochunk.py b/tests/test_autochunk/test_autochunk.py index caa2d9a80254..85a162084cc9 100644 --- a/tests/test_autochunk/test_autochunk.py +++ b/tests/test_autochunk/test_autochunk.py @@ -4,7 +4,7 @@ import torch.multiprocessing as mp import colossalai -from colossalai.autochunk.chunk_codegen import ChunkCodeGen +from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen from colossalai.core import global_context as gpc from colossalai.fx import ColoTracer from colossalai.fx.graph_module import ColoGraphModule @@ -82,7 +82,7 @@ def _run_offload_codegen(rank): MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, fake_device="cuda:0") ) - codegen = ChunkCodeGen(gm_prop) + codegen = AutoChunkCodeGen(gm_prop) graph.set_codegen(codegen) gm = ColoGraphModule(model, graph) gm.recompile() From 8a634af2f5510954e7a992c0ee894d22cf9e26d2 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Fri, 6 Jan 2023 14:19:45 +0800 Subject: [PATCH 087/209] close mem and code print --- colossalai/autochunk/autochunk_codegen.py | 4 ++-- colossalai/autochunk/chunk_region_search.py | 11 +++++++---- tests/test_autochunk/benchmark_autochunk.py | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py index 58a8c375136e..dcc6bba9ed0a 100644 --- a/colossalai/autochunk/autochunk_codegen.py +++ b/colossalai/autochunk/autochunk_codegen.py @@ -214,13 +214,13 @@ def emit_code_with_chunk( if CODEGEN_AVAILABLE: class AutoChunkCodeGen(CodeGen): - def __init__(self, meta_graph, max_memory=None): + def 
__init__(self, meta_graph, max_memory=None, print_mem=False): super().__init__() self.meta_graph = meta_graph self.max_memory = max_memory self.meta_node = list(meta_graph.graph.nodes) # find the chunk regions - self.chunk_region_search = ChunkRegionSearch(meta_graph, max_memory) + self.chunk_region_search = ChunkRegionSearch(meta_graph, max_memory, print_mem) self.chunk_infos = self.chunk_region_search.search_region() def _gen_python_code( diff --git a/colossalai/autochunk/chunk_region_search.py b/colossalai/autochunk/chunk_region_search.py index 0d0825f2584e..76b02cadeb3b 100644 --- a/colossalai/autochunk/chunk_region_search.py +++ b/colossalai/autochunk/chunk_region_search.py @@ -6,8 +6,9 @@ class ChunkRegionSearch(object): - def __init__(self, gm, max_memory=None) -> None: + def __init__(self, gm, max_memory=None, print_mem=False) -> None: self.gm = gm + self.print_mem = print_mem self.index_tracer = IndexTracer(list(gm.graph.nodes)) self.index_tracer.trace_index() self.memory_estimator = MemoryEstimator(self.index_tracer) @@ -204,8 +205,10 @@ def search_region(self): ) if self._stop_search(init_mem_peak, mem_peak): break - self.memory_estimator.estimate_chunk_inference_mem( - self.index_tracer.node_list, chunk_infos, print_mem=True - ) + if self.print_mem: + self.print_mem = False + self.memory_estimator.estimate_chunk_inference_mem( + self.index_tracer.node_list, chunk_infos, print_mem=True + ) return chunk_infos diff --git a/tests/test_autochunk/benchmark_autochunk.py b/tests/test_autochunk/benchmark_autochunk.py index 702eb7026bb7..9daaa364a710 100644 --- a/tests/test_autochunk/benchmark_autochunk.py +++ b/tests/test_autochunk/benchmark_autochunk.py @@ -64,7 +64,7 @@ def _build_autochunk(model, max_memory, node, pair): ) # set code_gen - codegen = AutoChunkCodeGen(gm_prop, max_memory) + codegen = AutoChunkCodeGen(gm_prop, max_memory, print_mem=False) graph.set_codegen(codegen) gm = ColoGraphModule(model, graph) gm.recompile() From 
2bde9d2b7fd43f3160088b820d926301f6527ebf Mon Sep 17 00:00:00 2001 From: oahzxl Date: Fri, 6 Jan 2023 14:21:49 +0800 Subject: [PATCH 088/209] code format --- colossalai/autochunk/autochunk_codegen.py | 4 +++- colossalai/autochunk/chunk_region_search.py | 14 +++++++++----- colossalai/autochunk/memory_estiamtor.py | 2 +- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py index dcc6bba9ed0a..fbd5d5e368dc 100644 --- a/colossalai/autochunk/autochunk_codegen.py +++ b/colossalai/autochunk/autochunk_codegen.py @@ -220,7 +220,9 @@ def __init__(self, meta_graph, max_memory=None, print_mem=False): self.max_memory = max_memory self.meta_node = list(meta_graph.graph.nodes) # find the chunk regions - self.chunk_region_search = ChunkRegionSearch(meta_graph, max_memory, print_mem) + self.chunk_region_search = ChunkRegionSearch( + meta_graph, max_memory, print_mem + ) self.chunk_infos = self.chunk_region_search.search_region() def _gen_python_code( diff --git a/colossalai/autochunk/chunk_region_search.py b/colossalai/autochunk/chunk_region_search.py index 76b02cadeb3b..7a0e8a36cd6c 100644 --- a/colossalai/autochunk/chunk_region_search.py +++ b/colossalai/autochunk/chunk_region_search.py @@ -1,8 +1,13 @@ +import copy + +from .chunk_selector import ChunkSelector from .index_tracer import IndexTracer from .memory_estiamtor import MemoryEstimator -from .chunk_selector import ChunkSelector -import copy -from .utils import is_non_compute_node, is_non_compute_node_except_placeholder, get_node_shape +from .utils import ( + get_node_shape, + is_non_compute_node, + is_non_compute_node_except_placeholder, +) class ChunkRegionSearch(object): @@ -11,7 +16,7 @@ def __init__(self, gm, max_memory=None, print_mem=False) -> None: self.print_mem = print_mem self.index_tracer = IndexTracer(list(gm.graph.nodes)) self.index_tracer.trace_index() - self.memory_estimator = MemoryEstimator(self.index_tracer) + 
self.memory_estimator = MemoryEstimator() self.chunk_selector = ChunkSelector( self.index_tracer, self.memory_estimator, max_memory=max_memory ) @@ -211,4 +216,3 @@ def search_region(self): self.index_tracer.node_list, chunk_infos, print_mem=True ) return chunk_infos - diff --git a/colossalai/autochunk/memory_estiamtor.py b/colossalai/autochunk/memory_estiamtor.py index c3d8b1803ce9..034f59e52858 100644 --- a/colossalai/autochunk/memory_estiamtor.py +++ b/colossalai/autochunk/memory_estiamtor.py @@ -16,7 +16,7 @@ class MemoryEstimator(object): - def __init__(self, index_tracer: IndexTracer) -> None: + def __init__(self) -> None: pass def _get_meta_node_size(self, x): From fd87d78a28a70fcb840c16d4084f67926ecc309c Mon Sep 17 00:00:00 2001 From: oahzxl Date: Fri, 6 Jan 2023 14:28:04 +0800 Subject: [PATCH 089/209] rename ambiguous variable --- colossalai/autochunk/chunk_selector.py | 14 +++++++------- tests/test_autochunk/evoformer/ops.py | 6 +++--- tests/test_autochunk/openfold/tensor_utils.py | 8 ++++---- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/colossalai/autochunk/chunk_selector.py b/colossalai/autochunk/chunk_selector.py index f84322082cc4..aeab66572099 100644 --- a/colossalai/autochunk/chunk_selector.py +++ b/colossalai/autochunk/chunk_selector.py @@ -126,14 +126,14 @@ def _get_fit_chunk_size(self, chunk_region_dict, chunk_infos): ) return chunk_info - def _chunk_size_binary_search(self, l, r, chunk_region_dict, chunk_infos): - if l >= 16: + def _chunk_size_binary_search(self, left, right, chunk_region_dict, chunk_infos): + if left >= 16: gap = 4 else: gap = 1 chunk_info = chunk_region_dict["reorder_chunk_info"] - while r >= l + gap: - mid = int((l + r) / 2 + 0.5) + while right >= left + gap: + mid = int((left + right) / 2 + 0.5) chunk_info["chunk_size"] = mid cur_chunk_infos = chunk_infos + [chunk_info] cur_mem_peak = self.memory_estimator.estimate_chunk_inference_mem( @@ -143,10 +143,10 @@ def _chunk_size_binary_search(self, l, r, 
chunk_region_dict, chunk_infos): cur_mem_peak[chunk_info["region"][0] : chunk_info["region"][1] + 1] ) if cur_chunk_max_mem >= self.max_memory: - r = mid - gap + right = mid - gap else: - l = mid + gap - return l + left = mid + gap + return left def _get_compute_node_num(self, start, end): count = 0 diff --git a/tests/test_autochunk/evoformer/ops.py b/tests/test_autochunk/evoformer/ops.py index 611b7b0fe777..a56057522eaa 100755 --- a/tests/test_autochunk/evoformer/ops.py +++ b/tests/test_autochunk/evoformer/ops.py @@ -67,10 +67,10 @@ def forward(self, M): left_act = self.linear_a(M) right_act = self.linear_b(M) - O = torch.einsum('bsid,bsje->bijde', left_act, right_act).contiguous() + o = torch.einsum('bsid,bsje->bijde', left_act, right_act).contiguous() # O = rearrange(O, 'b i j d e -> b i j (d e)') - O = O.reshape(O.shape[0], O.shape[1], O.shape[2], -1) - Z = self.o_linear(O) + o = o.reshape(o.shape[0], o.shape[1], o.shape[2], -1) + Z = self.o_linear(o) return Z diff --git a/tests/test_autochunk/openfold/tensor_utils.py b/tests/test_autochunk/openfold/tensor_utils.py index 7e5e8e4b6b5e..384a71fb5ffd 100644 --- a/tests/test_autochunk/openfold/tensor_utils.py +++ b/tests/test_autochunk/openfold/tensor_utils.py @@ -157,12 +157,12 @@ def _get_minimal_slice_set( # start_edges and end_edges both indicate whether, starting from any given # dimension, the start/end index is at the top/bottom edge of the # corresponding tensor, modeled as a tree - def reduce_edge_list(l): + def reduce_edge_list(ll): tally = 1 - for i in range(len(l)): + for i in range(len(ll)): reversed_idx = -1 * (i + 1) - l[reversed_idx] *= tally - tally = l[reversed_idx] + ll[reversed_idx] *= tally + tally = ll[reversed_idx] if(start_edges is None): start_edges = [s == 0 for s in start] From ae27a8b26d7a36a3d9215fc6fd1db92982bdeef7 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Fri, 6 Jan 2023 14:57:33 +0800 Subject: [PATCH 090/209] seperate flow tracer --- colossalai/autochunk/index_tracer.py | 24 
+++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/colossalai/autochunk/index_tracer.py b/colossalai/autochunk/index_tracer.py index 7a86f3c998fb..0323e3a7e07d 100644 --- a/colossalai/autochunk/index_tracer.py +++ b/colossalai/autochunk/index_tracer.py @@ -745,14 +745,7 @@ def _assgin_single_node_flow( next_node_list.append(arg_node) return True - def flow_search(self, start_idx, start_dim, end_idx, end_dim): - inputs, outputs = find_chunk_compute_input_and_output_nodes( - self.node_list[start_idx : end_idx + 1] - ) - # only single ouput - if len(outputs) > 1: - return None - + def _get_all_node_info(self, end_dim, start_idx, end_idx): cur_node_list = [self.node_list[end_idx]] # start from the last node all_node_info = {cur_node_list[0]: {"chunk_dim": end_dim, "fix_dim": []}} @@ -763,7 +756,6 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim): # get cur node info cur_node_chunk_dim = all_node_info[cur_node]["chunk_dim"] cur_node_fix_dim = all_node_info[cur_node]["fix_dim"] - cur_node_idx = find_idx_by_name(cur_node.name, self.node_list) if cur_node_chunk_dim: cur_node_compute = self._find_compute_trace_from_node(cur_node) cur_node_source = self._find_source_trace_from_node(cur_node) @@ -818,6 +810,20 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim): else: raise NotImplementedError() cur_node_list = next_node_list + return all_node_info + + def flow_search(self, start_idx, start_dim, end_idx, end_dim): + inputs, outputs = find_chunk_compute_input_and_output_nodes( + self.node_list[start_idx : end_idx + 1] + ) + # only single ouput + if len(outputs) > 1: + return None + + # get every node's chunk dim and fix dim + all_node_info = self._get_all_node_info(end_dim, start_idx, end_idx) + if all_node_info is None: + return None inputs_dim = [] remove_inputs = [] From f4a1607e5645e3a537df6e88b67fb57a8fc6ed4f Mon Sep 17 00:00:00 2001 From: oahzxl Date: Fri, 6 Jan 2023 15:36:17 +0800 Subject: [PATCH 091/209] 
seperate input node dim search --- colossalai/autochunk/index_tracer.py | 35 +++++++++++++++++----------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/colossalai/autochunk/index_tracer.py b/colossalai/autochunk/index_tracer.py index 0323e3a7e07d..221217e2d101 100644 --- a/colossalai/autochunk/index_tracer.py +++ b/colossalai/autochunk/index_tracer.py @@ -812,19 +812,7 @@ def _get_all_node_info(self, end_dim, start_idx, end_idx): cur_node_list = next_node_list return all_node_info - def flow_search(self, start_idx, start_dim, end_idx, end_dim): - inputs, outputs = find_chunk_compute_input_and_output_nodes( - self.node_list[start_idx : end_idx + 1] - ) - # only single ouput - if len(outputs) > 1: - return None - - # get every node's chunk dim and fix dim - all_node_info = self._get_all_node_info(end_dim, start_idx, end_idx) - if all_node_info is None: - return None - + def _get_input_nodes_dim(self, inputs, start_idx, end_idx, all_node_info): inputs_dim = [] remove_inputs = [] for input_node in inputs: @@ -841,7 +829,7 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim): if input_node_idx in user_source: input_dict[user_idx] = user_source[input_node_idx] else: - return None + return None, None if len(input_dict) == 0: remove_inputs.append(input_node) else: @@ -849,6 +837,25 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim): for i in remove_inputs: if i in inputs: inputs.remove(i) + return inputs, inputs_dim + + def flow_search(self, start_idx, start_dim, end_idx, end_dim): + inputs, outputs = find_chunk_compute_input_and_output_nodes( + self.node_list[start_idx : end_idx + 1] + ) + # only single ouput + if len(outputs) > 1: + return None + + # get every node's chunk dim and fix dim + all_node_info = self._get_all_node_info(end_dim, start_idx, end_idx) + if all_node_info is None: + return None + + # get input nodes' chunk dim + inputs, inputs_dim = self._get_input_nodes_dim(inputs, start_idx, end_idx, all_node_info) + if 
inputs is None: + return None chunk_info = { "region": (start_idx, end_idx), From f856611d217e13c11ea382fe9d8f8af4cdeabb49 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Fri, 6 Jan 2023 15:47:17 +0800 Subject: [PATCH 092/209] seperate prepose_nodes --- colossalai/autochunk/index_tracer.py | 68 +++++++++++++++------------- 1 file changed, 36 insertions(+), 32 deletions(-) diff --git a/colossalai/autochunk/index_tracer.py b/colossalai/autochunk/index_tracer.py index 221217e2d101..206d2edbd5df 100644 --- a/colossalai/autochunk/index_tracer.py +++ b/colossalai/autochunk/index_tracer.py @@ -839,36 +839,7 @@ def _get_input_nodes_dim(self, inputs, start_idx, end_idx, all_node_info): inputs.remove(i) return inputs, inputs_dim - def flow_search(self, start_idx, start_dim, end_idx, end_dim): - inputs, outputs = find_chunk_compute_input_and_output_nodes( - self.node_list[start_idx : end_idx + 1] - ) - # only single ouput - if len(outputs) > 1: - return None - - # get every node's chunk dim and fix dim - all_node_info = self._get_all_node_info(end_dim, start_idx, end_idx) - if all_node_info is None: - return None - - # get input nodes' chunk dim - inputs, inputs_dim = self._get_input_nodes_dim(inputs, start_idx, end_idx, all_node_info) - if inputs is None: - return None - - chunk_info = { - "region": (start_idx, end_idx), - "inputs": inputs, - "inputs_non_chunk": [], - "inputs_dim": inputs_dim, - "outputs": outputs, - "outputs_dim": end_dim, - "node_chunk_dim": all_node_info, - "args": {}, - } - - # move useless nodes ahead of loop + def _set_prepose_nodes(self, all_node_info, start_idx, end_idx): # get all possible prepose nodes maybe_prepose_nodes = [] for node, node_info in all_node_info.items(): @@ -929,12 +900,45 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim): maybe_prepose_nodes.remove(n) # sort by index prepose_nodes.sort(key=lambda x: find_idx_by_name(x.name, self.node_list)) - chunk_info["args"]["prepose_nodes"] = prepose_nodes + + return prepose_nodes 
+ + def flow_search(self, start_idx, start_dim, end_idx, end_dim): + inputs, outputs = find_chunk_compute_input_and_output_nodes( + self.node_list[start_idx : end_idx + 1] + ) + # only single ouput + if len(outputs) > 1: + return None + + # get every node's chunk dim and fix dim + all_node_info = self._get_all_node_info(end_dim, start_idx, end_idx) + if all_node_info is None: + return None + + # get input nodes' chunk dim + inputs, inputs_dim = self._get_input_nodes_dim(inputs, start_idx, end_idx, all_node_info) + if inputs is None: + return None + + chunk_info = { + "region": (start_idx, end_idx), + "inputs": inputs, + "inputs_non_chunk": [], + "inputs_dim": inputs_dim, + "outputs": outputs, + "outputs_dim": end_dim, + "node_chunk_dim": all_node_info, + "args": {}, + } + + # move useless nodes ahead of loop + chunk_info["args"]["prepose_nodes"] = self._set_prepose_nodes(all_node_info, start_idx, end_idx) # we need to log input nodes to avoid deleteing them in the loop chunk_node_list = self.node_list[start_idx : end_idx + 1] # also need to get some prepose node's arg out of non_chunk_inputs - for n in prepose_nodes: + for n in chunk_info["args"]["prepose_nodes"]: chunk_node_list.remove(n) non_chunk_inputs = find_chunk_all_input_nodes(chunk_node_list) for i in non_chunk_inputs: From 6685a9d022a912ab3d0a57486b045b92b3f681ce Mon Sep 17 00:00:00 2001 From: oahzxl Date: Fri, 6 Jan 2023 15:53:24 +0800 Subject: [PATCH 093/209] seperate non chunk input --- colossalai/autochunk/index_tracer.py | 35 +++++++++++++++++----------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/colossalai/autochunk/index_tracer.py b/colossalai/autochunk/index_tracer.py index 206d2edbd5df..202044763b0f 100644 --- a/colossalai/autochunk/index_tracer.py +++ b/colossalai/autochunk/index_tracer.py @@ -839,7 +839,7 @@ def _get_input_nodes_dim(self, inputs, start_idx, end_idx, all_node_info): inputs.remove(i) return inputs, inputs_dim - def _set_prepose_nodes(self, all_node_info, 
start_idx, end_idx): + def _get_prepose_nodes(self, all_node_info, start_idx, end_idx): # get all possible prepose nodes maybe_prepose_nodes = [] for node, node_info in all_node_info.items(): @@ -902,7 +902,19 @@ def _set_prepose_nodes(self, all_node_info, start_idx, end_idx): prepose_nodes.sort(key=lambda x: find_idx_by_name(x.name, self.node_list)) return prepose_nodes - + + def _get_non_chunk_inputs(self, chunk_info, start_idx, end_idx): + # we need to log input nodes to avoid deleteing them in the loop + chunk_node_list = self.node_list[start_idx : end_idx + 1] + # also need to get some prepose node's arg out of non_chunk_inputs + for n in chunk_info["args"]["prepose_nodes"]: + chunk_node_list.remove(n) + non_chunk_inputs = find_chunk_all_input_nodes(chunk_node_list) + for i in non_chunk_inputs: + if i not in chunk_info["inputs"]: + chunk_info["inputs_non_chunk"].append(i) + return chunk_info + def flow_search(self, start_idx, start_dim, end_idx, end_dim): inputs, outputs = find_chunk_compute_input_and_output_nodes( self.node_list[start_idx : end_idx + 1] @@ -917,7 +929,9 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim): return None # get input nodes' chunk dim - inputs, inputs_dim = self._get_input_nodes_dim(inputs, start_idx, end_idx, all_node_info) + inputs, inputs_dim = self._get_input_nodes_dim( + inputs, start_idx, end_idx, all_node_info + ) if inputs is None: return None @@ -933,17 +947,12 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim): } # move useless nodes ahead of loop - chunk_info["args"]["prepose_nodes"] = self._set_prepose_nodes(all_node_info, start_idx, end_idx) + chunk_info["args"]["prepose_nodes"] = self._get_prepose_nodes( + all_node_info, start_idx, end_idx + ) - # we need to log input nodes to avoid deleteing them in the loop - chunk_node_list = self.node_list[start_idx : end_idx + 1] - # also need to get some prepose node's arg out of non_chunk_inputs - for n in chunk_info["args"]["prepose_nodes"]: - 
chunk_node_list.remove(n) - non_chunk_inputs = find_chunk_all_input_nodes(chunk_node_list) - for i in non_chunk_inputs: - if i not in chunk_info["inputs"]: - chunk_info["inputs_non_chunk"].append(i) + # find non chunk inputs + chunk_info = self._get_non_chunk_inputs(chunk_info, start_idx, end_idx) # reassgin reshape size, some size may have changed due to chunk chunk_info = self._reassgin_reshape_size(chunk_info) From c3d72f7db9e2fc28e9a3aa92749f08c7a7d51e42 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Fri, 6 Jan 2023 16:53:01 +0800 Subject: [PATCH 094/209] seperate reorder --- colossalai/autochunk/autochunk_codegen.py | 4 +-- colossalai/autochunk/chunk_region_search.py | 7 +++-- colossalai/autochunk/chunk_selector.py | 8 ++++-- colossalai/autochunk/index_tracer.py | 31 ++++++++++++--------- 4 files changed, 29 insertions(+), 21 deletions(-) diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py index fbd5d5e368dc..b4144196accc 100644 --- a/colossalai/autochunk/autochunk_codegen.py +++ b/colossalai/autochunk/autochunk_codegen.py @@ -103,7 +103,7 @@ def emit_code_with_chunk( nodes, emit_node_func, delete_unused_value_func, - chunk_region_search, + chunk_region_search: ChunkRegionSearch, chunk_infos, ): """Emit code with nested activation checkpoint @@ -133,7 +133,7 @@ def emit_code_with_chunk( chunk_outputs = [i["outputs"][0] for i in chunk_infos] chunk_outputs_dim = [i["outputs_dim"] for i in chunk_infos] - node_list = chunk_region_search.index_tracer.reorder_node_list(node_list) + node_list = chunk_region_search.reorder_graph.reorder_node_list(node_list) node_idx = 0 region_idx = 0 within_chunk_region = False diff --git a/colossalai/autochunk/chunk_region_search.py b/colossalai/autochunk/chunk_region_search.py index 7a0e8a36cd6c..47e2fe13ceb5 100644 --- a/colossalai/autochunk/chunk_region_search.py +++ b/colossalai/autochunk/chunk_region_search.py @@ -1,7 +1,7 @@ import copy from .chunk_selector import ChunkSelector -from 
.index_tracer import IndexTracer +from .index_tracer import IndexTracer, ReorderGraph from .memory_estiamtor import MemoryEstimator from .utils import ( get_node_shape, @@ -16,9 +16,10 @@ def __init__(self, gm, max_memory=None, print_mem=False) -> None: self.print_mem = print_mem self.index_tracer = IndexTracer(list(gm.graph.nodes)) self.index_tracer.trace_index() + self.reorder_graph = ReorderGraph(self.index_tracer) self.memory_estimator = MemoryEstimator() self.chunk_selector = ChunkSelector( - self.index_tracer, self.memory_estimator, max_memory=max_memory + self.index_tracer, self.memory_estimator, self.reorder_graph, max_memory=max_memory ) def _find_peak_node(self, mem_peak): @@ -175,7 +176,7 @@ def _step_search(self, mem_peak, active_node, chunk_regions): best_chunk_region = self.chunk_selector._select_best_chunk_region( possible_chunk_regions, chunk_regions, peak_node, max_chunk_region, mem_peak ) - best_chunk_region = self.index_tracer.reorder_all(best_chunk_region) + best_chunk_region = self.reorder_graph.reorder_all(best_chunk_region) return best_chunk_region def _stop_search(self, init_mem_peak, mem_peak): diff --git a/colossalai/autochunk/chunk_selector.py b/colossalai/autochunk/chunk_selector.py index aeab66572099..119ff8aafdd0 100644 --- a/colossalai/autochunk/chunk_selector.py +++ b/colossalai/autochunk/chunk_selector.py @@ -1,4 +1,4 @@ -from .index_tracer import IndexTracer +from .index_tracer import IndexTracer, ReorderGraph from .memory_estiamtor import MemoryEstimator from .utils import is_non_compute_node @@ -8,10 +8,12 @@ def __init__( self, index_tracer: IndexTracer, memory_estimator: MemoryEstimator, + reorder_graph: ReorderGraph, max_memory=None, ): self.index_tracer = index_tracer self.memory_estimator = memory_estimator + self.reorder_graph = reorder_graph if max_memory is not None: self.stratge = "fit_memory" self.max_memory = max_memory # MB @@ -64,7 +66,7 @@ def _select_fit_memory_chunk_region( regions_dict = [] for region in 
possible_chunk_regions: cur_region = region.copy() - cur_node_list, cur_region = self.index_tracer.tmp_reorder( + cur_node_list, cur_region = self.reorder_graph.tmp_reorder( self.index_tracer.node_list, cur_region ) cur_chunk_infos = chunk_infos + [cur_region] @@ -174,7 +176,7 @@ def _select_min_memory_chunk_region( regions_dict = [] for region in possible_chunk_regions: cur_region = region.copy() - cur_node_list, cur_region = self.index_tracer.tmp_reorder( + cur_node_list, cur_region = self.reorder_graph.tmp_reorder( self.index_tracer.node_list, cur_region ) cur_chunk_infos = chunk_infos + [cur_region] diff --git a/colossalai/autochunk/index_tracer.py b/colossalai/autochunk/index_tracer.py index 202044763b0f..8b4d3aabd13a 100644 --- a/colossalai/autochunk/index_tracer.py +++ b/colossalai/autochunk/index_tracer.py @@ -17,7 +17,6 @@ def __init__(self, node_list) -> None: self.idx_trace_equal = [] self.idx_view_list = {} self.idx_count = -1 - self.all_reorder_map = {i: i for i in range(len(self.idx_trace_list))} def _init_idx_trace_list(self): idx_trace_list = [] @@ -981,24 +980,30 @@ def _reassgin_reshape_size(self, chunk_info): chunk_info["reshape_size"] = reshape_size return chunk_info + +class ReorderGraph(object): + def __init__(self, index_tracer: IndexTracer) -> None: + self.index_tracer = index_tracer + self.all_reorder_map = {i: i for i in range(len(self.index_tracer.idx_trace_list))} + def _get_reorder_map(self, chunk_info): - reorder_map = {i: i for i in range(len(self.node_list))} + reorder_map = {i: i for i in range(len(self.index_tracer.node_list))} chunk_region_start = chunk_info["region"][0] chunk_region_end = chunk_info["region"][1] chunk_prepose_nodes = chunk_info["args"]["prepose_nodes"] chunk_prepose_nodes_idx = [ - find_idx_by_name(i.name, self.node_list) for i in chunk_prepose_nodes + find_idx_by_name(i.name, self.index_tracer.node_list) for i in chunk_prepose_nodes ] # put prepose nodes ahead for idx, n in enumerate(chunk_prepose_nodes): n_idx 
= chunk_prepose_nodes_idx[idx] reorder_map[n_idx] = chunk_region_start + idx # put other nodes after prepose nodes - for n in self.node_list[chunk_region_start : chunk_region_end + 1]: + for n in self.index_tracer.node_list[chunk_region_start : chunk_region_end + 1]: if n in chunk_prepose_nodes: continue - n_idx = find_idx_by_name(n.name, self.node_list) + n_idx = find_idx_by_name(n.name, self.index_tracer.node_list) pos = sum([n_idx < i for i in chunk_prepose_nodes_idx]) reorder_map[n_idx] = n_idx + pos @@ -1024,25 +1029,25 @@ def _update_all_reorder_map(self, reorder_map): self.all_reorder_map[origin_idx] = reorder_map[map_idx] def _reorder_self_node_list(self, reorder_map): - new_node_list = [None for _ in range(len(self.node_list))] + new_node_list = [None for _ in range(len(self.index_tracer.node_list))] for old_idx, new_idx in reorder_map.items(): - new_node_list[new_idx] = self.node_list[old_idx] - self.node_list = new_node_list + new_node_list[new_idx] = self.index_tracer.node_list[old_idx] + self.index_tracer.node_list = new_node_list def _reorder_idx_trace(self, reorder_map): # reorder list - new_idx_trace_list = [None for _ in range(len(self.idx_trace_list))] + new_idx_trace_list = [None for _ in range(len(self.index_tracer.idx_trace_list))] for old_idx, new_idx in reorder_map.items(): - new_idx_trace_list[new_idx] = self.idx_trace_list[old_idx] - self.idx_trace_list = new_idx_trace_list + new_idx_trace_list[new_idx] = self.index_tracer.idx_trace_list[old_idx] + self.index_tracer.idx_trace_list = new_idx_trace_list # update compute - for idx_trace in self.idx_trace_list: + for idx_trace in self.index_tracer.idx_trace_list: compute = idx_trace["compute"] for dim_compute in compute: for idx, i in enumerate(dim_compute): dim_compute[idx] = reorder_map[i] # update source - for idx_trace in self.idx_trace_list: + for idx_trace in self.index_tracer.idx_trace_list: source = idx_trace["source"] for dim_idx, dim_source in enumerate(source): new_dim_source = {} 
From da4076846d693be0153c8e89ee48ce25f56d09ce Mon Sep 17 00:00:00 2001 From: oahzxl Date: Fri, 6 Jan 2023 17:09:37 +0800 Subject: [PATCH 095/209] rename --- colossalai/autochunk/autochunk_codegen.py | 6 +++--- .../{memory_estiamtor.py => estiamte_memory.py} | 3 +-- .../{chunk_region_search.py => search_chunk.py} | 14 +++++++------- .../{chunk_selector.py => select_chunk.py} | 10 +++++----- .../autochunk/{index_tracer.py => trace_index.py} | 4 ++-- tests/test_autochunk/benchmark_autochunk.py | 2 +- 6 files changed, 19 insertions(+), 20 deletions(-) rename colossalai/autochunk/{memory_estiamtor.py => estiamte_memory.py} (99%) rename colossalai/autochunk/{chunk_region_search.py => search_chunk.py} (96%) rename colossalai/autochunk/{chunk_selector.py => select_chunk.py} (97%) rename colossalai/autochunk/{index_tracer.py => trace_index.py} (99%) diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py index b4144196accc..3bb2e83be242 100644 --- a/colossalai/autochunk/autochunk_codegen.py +++ b/colossalai/autochunk/autochunk_codegen.py @@ -17,7 +17,7 @@ import colossalai -from .chunk_region_search import ChunkRegionSearch +from .search_chunk import SearchChunk from .utils import delete_free_var_from_last_use, find_idx_by_name, get_node_shape CODEGEN_AVAILABLE = True @@ -103,7 +103,7 @@ def emit_code_with_chunk( nodes, emit_node_func, delete_unused_value_func, - chunk_region_search: ChunkRegionSearch, + chunk_region_search: SearchChunk, chunk_infos, ): """Emit code with nested activation checkpoint @@ -220,7 +220,7 @@ def __init__(self, meta_graph, max_memory=None, print_mem=False): self.max_memory = max_memory self.meta_node = list(meta_graph.graph.nodes) # find the chunk regions - self.chunk_region_search = ChunkRegionSearch( + self.chunk_region_search = SearchChunk( meta_graph, max_memory, print_mem ) self.chunk_infos = self.chunk_region_search.search_region() diff --git a/colossalai/autochunk/memory_estiamtor.py 
b/colossalai/autochunk/estiamte_memory.py similarity index 99% rename from colossalai/autochunk/memory_estiamtor.py rename to colossalai/autochunk/estiamte_memory.py index 034f59e52858..90cfd66a00d5 100644 --- a/colossalai/autochunk/memory_estiamtor.py +++ b/colossalai/autochunk/estiamte_memory.py @@ -6,7 +6,6 @@ from colossalai.fx.profiler import activation_size, parameter_size -from .index_tracer import IndexTracer from .utils import ( delete_free_var_from_last_use, find_idx_by_name, @@ -15,7 +14,7 @@ ) -class MemoryEstimator(object): +class EstimateMemory(object): def __init__(self) -> None: pass diff --git a/colossalai/autochunk/chunk_region_search.py b/colossalai/autochunk/search_chunk.py similarity index 96% rename from colossalai/autochunk/chunk_region_search.py rename to colossalai/autochunk/search_chunk.py index 47e2fe13ceb5..5c58bda0c393 100644 --- a/colossalai/autochunk/chunk_region_search.py +++ b/colossalai/autochunk/search_chunk.py @@ -1,8 +1,8 @@ import copy -from .chunk_selector import ChunkSelector -from .index_tracer import IndexTracer, ReorderGraph -from .memory_estiamtor import MemoryEstimator +from .select_chunk import SelectChunk +from .trace_index import TraceIndex, ReorderGraph +from .estiamte_memory import EstimateMemory from .utils import ( get_node_shape, is_non_compute_node, @@ -10,15 +10,15 @@ ) -class ChunkRegionSearch(object): +class SearchChunk(object): def __init__(self, gm, max_memory=None, print_mem=False) -> None: self.gm = gm self.print_mem = print_mem - self.index_tracer = IndexTracer(list(gm.graph.nodes)) + self.index_tracer = TraceIndex(list(gm.graph.nodes)) self.index_tracer.trace_index() self.reorder_graph = ReorderGraph(self.index_tracer) - self.memory_estimator = MemoryEstimator() - self.chunk_selector = ChunkSelector( + self.memory_estimator = EstimateMemory() + self.chunk_selector = SelectChunk( self.index_tracer, self.memory_estimator, self.reorder_graph, max_memory=max_memory ) diff --git 
a/colossalai/autochunk/chunk_selector.py b/colossalai/autochunk/select_chunk.py similarity index 97% rename from colossalai/autochunk/chunk_selector.py rename to colossalai/autochunk/select_chunk.py index 119ff8aafdd0..f0262f1e57eb 100644 --- a/colossalai/autochunk/chunk_selector.py +++ b/colossalai/autochunk/select_chunk.py @@ -1,13 +1,13 @@ -from .index_tracer import IndexTracer, ReorderGraph -from .memory_estiamtor import MemoryEstimator +from .trace_index import TraceIndex, ReorderGraph +from .estiamte_memory import EstimateMemory from .utils import is_non_compute_node -class ChunkSelector(object): +class SelectChunk(object): def __init__( self, - index_tracer: IndexTracer, - memory_estimator: MemoryEstimator, + index_tracer: TraceIndex, + memory_estimator: EstimateMemory, reorder_graph: ReorderGraph, max_memory=None, ): diff --git a/colossalai/autochunk/index_tracer.py b/colossalai/autochunk/trace_index.py similarity index 99% rename from colossalai/autochunk/index_tracer.py rename to colossalai/autochunk/trace_index.py index 8b4d3aabd13a..103a05dadbf5 100644 --- a/colossalai/autochunk/index_tracer.py +++ b/colossalai/autochunk/trace_index.py @@ -10,7 +10,7 @@ ) -class IndexTracer(object): +class TraceIndex(object): def __init__(self, node_list) -> None: self.node_list = node_list self.idx_trace_list = self._init_idx_trace_list() @@ -982,7 +982,7 @@ def _reassgin_reshape_size(self, chunk_info): class ReorderGraph(object): - def __init__(self, index_tracer: IndexTracer) -> None: + def __init__(self, index_tracer: TraceIndex) -> None: self.index_tracer = index_tracer self.all_reorder_map = {i: i for i in range(len(self.index_tracer.idx_trace_list))} diff --git a/tests/test_autochunk/benchmark_autochunk.py b/tests/test_autochunk/benchmark_autochunk.py index 9daaa364a710..081f01368a42 100644 --- a/tests/test_autochunk/benchmark_autochunk.py +++ b/tests/test_autochunk/benchmark_autochunk.py @@ -104,7 +104,7 @@ def benchmark_evoformer(): model = 
evoformer_base().cuda() # build autochunk model - # max_memory = 10000 # MB fit memory mode + # max_memory = 1000 # MB fit memory mode max_memory = None # min memory mode autochunk = _build_autochunk(evoformer_base().cuda(), max_memory, node, pair) From 4748967fb12747043c6688b3f13190203ade769f Mon Sep 17 00:00:00 2001 From: oahzxl Date: Fri, 6 Jan 2023 17:13:18 +0800 Subject: [PATCH 096/209] ad reorder graph --- colossalai/autochunk/reorder_graph.py | 108 ++++++++++++++++++++++++++ colossalai/autochunk/trace_index.py | 106 ------------------------- 2 files changed, 108 insertions(+), 106 deletions(-) create mode 100644 colossalai/autochunk/reorder_graph.py diff --git a/colossalai/autochunk/reorder_graph.py b/colossalai/autochunk/reorder_graph.py new file mode 100644 index 000000000000..7b9f4a20d6ab --- /dev/null +++ b/colossalai/autochunk/reorder_graph.py @@ -0,0 +1,108 @@ +from .trace_index import TraceIndex +from .utils import find_idx_by_name + + +class ReorderGraph(object): + def __init__(self, index_tracer: TraceIndex) -> None: + self.index_tracer = index_tracer + self.all_reorder_map = {i: i for i in range(len(self.index_tracer.idx_trace_list))} + + def _get_reorder_map(self, chunk_info): + reorder_map = {i: i for i in range(len(self.index_tracer.node_list))} + + chunk_region_start = chunk_info["region"][0] + chunk_region_end = chunk_info["region"][1] + chunk_prepose_nodes = chunk_info["args"]["prepose_nodes"] + chunk_prepose_nodes_idx = [ + find_idx_by_name(i.name, self.index_tracer.node_list) for i in chunk_prepose_nodes + ] + # put prepose nodes ahead + for idx, n in enumerate(chunk_prepose_nodes): + n_idx = chunk_prepose_nodes_idx[idx] + reorder_map[n_idx] = chunk_region_start + idx + # put other nodes after prepose nodes + for n in self.index_tracer.node_list[chunk_region_start : chunk_region_end + 1]: + if n in chunk_prepose_nodes: + continue + n_idx = find_idx_by_name(n.name, self.index_tracer.node_list) + pos = sum([n_idx < i for i in 
chunk_prepose_nodes_idx]) + reorder_map[n_idx] = n_idx + pos + + return reorder_map + + def _reorder_chunk_info(self, chunk_info, reorder_map): + # update chunk info + chunk_info["region"] = ( + chunk_info["region"][0] + len(chunk_info["args"]["prepose_nodes"]), + chunk_info["region"][1], + ) + new_inputs_dim = [] + for idx, input_dim in enumerate(chunk_info["inputs_dim"]): + new_input_dim = {} + for k, v in input_dim.items(): + new_input_dim[reorder_map[k]] = v + new_inputs_dim.append(new_input_dim) + chunk_info["inputs_dim"] = new_inputs_dim + return chunk_info + + def _update_all_reorder_map(self, reorder_map): + for origin_idx, map_idx in self.all_reorder_map.items(): + self.all_reorder_map[origin_idx] = reorder_map[map_idx] + + def _reorder_self_node_list(self, reorder_map): + new_node_list = [None for _ in range(len(self.index_tracer.node_list))] + for old_idx, new_idx in reorder_map.items(): + new_node_list[new_idx] = self.index_tracer.node_list[old_idx] + self.index_tracer.node_list = new_node_list + + def _reorder_idx_trace(self, reorder_map): + # reorder list + new_idx_trace_list = [None for _ in range(len(self.index_tracer.idx_trace_list))] + for old_idx, new_idx in reorder_map.items(): + new_idx_trace_list[new_idx] = self.index_tracer.idx_trace_list[old_idx] + self.index_tracer.idx_trace_list = new_idx_trace_list + # update compute + for idx_trace in self.index_tracer.idx_trace_list: + compute = idx_trace["compute"] + for dim_compute in compute: + for idx, i in enumerate(dim_compute): + dim_compute[idx] = reorder_map[i] + # update source + for idx_trace in self.index_tracer.idx_trace_list: + source = idx_trace["source"] + for dim_idx, dim_source in enumerate(source): + new_dim_source = {} + for k, v in dim_source.items(): + new_dim_source[reorder_map[k]] = v + source[dim_idx] = new_dim_source + + def reorder_all(self, chunk_info): + if chunk_info is None: + return chunk_info + if len(chunk_info["args"]["prepose_nodes"]) == 0: + return chunk_info + 
reorder_map = self._get_reorder_map(chunk_info) + self._update_all_reorder_map(reorder_map) + self._reorder_idx_trace(reorder_map) + self._reorder_self_node_list(reorder_map) + chunk_info = self._reorder_chunk_info(chunk_info, reorder_map) + return chunk_info + + def reorder_node_list(self, node_list): + new_node_list = [None for _ in range(len(node_list))] + for old_idx, new_idx in self.all_reorder_map.items(): + new_node_list[new_idx] = node_list[old_idx] + return new_node_list + + def tmp_reorder(self, node_list, chunk_info): + if len(chunk_info["args"]["prepose_nodes"]) == 0: + return node_list, chunk_info + reorder_map = self._get_reorder_map(chunk_info) + + # new tmp node list + new_node_list = [None for _ in range(len(node_list))] + for old_idx, new_idx in reorder_map.items(): + new_node_list[new_idx] = node_list[old_idx] + + chunk_info = self._reorder_chunk_info(chunk_info, reorder_map) + return new_node_list, chunk_info diff --git a/colossalai/autochunk/trace_index.py b/colossalai/autochunk/trace_index.py index 103a05dadbf5..3ac0d7f84272 100644 --- a/colossalai/autochunk/trace_index.py +++ b/colossalai/autochunk/trace_index.py @@ -979,109 +979,3 @@ def _reassgin_reshape_size(self, chunk_info): ) chunk_info["reshape_size"] = reshape_size return chunk_info - - -class ReorderGraph(object): - def __init__(self, index_tracer: TraceIndex) -> None: - self.index_tracer = index_tracer - self.all_reorder_map = {i: i for i in range(len(self.index_tracer.idx_trace_list))} - - def _get_reorder_map(self, chunk_info): - reorder_map = {i: i for i in range(len(self.index_tracer.node_list))} - - chunk_region_start = chunk_info["region"][0] - chunk_region_end = chunk_info["region"][1] - chunk_prepose_nodes = chunk_info["args"]["prepose_nodes"] - chunk_prepose_nodes_idx = [ - find_idx_by_name(i.name, self.index_tracer.node_list) for i in chunk_prepose_nodes - ] - # put prepose nodes ahead - for idx, n in enumerate(chunk_prepose_nodes): - n_idx = chunk_prepose_nodes_idx[idx] - 
reorder_map[n_idx] = chunk_region_start + idx - # put other nodes after prepose nodes - for n in self.index_tracer.node_list[chunk_region_start : chunk_region_end + 1]: - if n in chunk_prepose_nodes: - continue - n_idx = find_idx_by_name(n.name, self.index_tracer.node_list) - pos = sum([n_idx < i for i in chunk_prepose_nodes_idx]) - reorder_map[n_idx] = n_idx + pos - - return reorder_map - - def _reorder_chunk_info(self, chunk_info, reorder_map): - # update chunk info - chunk_info["region"] = ( - chunk_info["region"][0] + len(chunk_info["args"]["prepose_nodes"]), - chunk_info["region"][1], - ) - new_inputs_dim = [] - for idx, input_dim in enumerate(chunk_info["inputs_dim"]): - new_input_dim = {} - for k, v in input_dim.items(): - new_input_dim[reorder_map[k]] = v - new_inputs_dim.append(new_input_dim) - chunk_info["inputs_dim"] = new_inputs_dim - return chunk_info - - def _update_all_reorder_map(self, reorder_map): - for origin_idx, map_idx in self.all_reorder_map.items(): - self.all_reorder_map[origin_idx] = reorder_map[map_idx] - - def _reorder_self_node_list(self, reorder_map): - new_node_list = [None for _ in range(len(self.index_tracer.node_list))] - for old_idx, new_idx in reorder_map.items(): - new_node_list[new_idx] = self.index_tracer.node_list[old_idx] - self.index_tracer.node_list = new_node_list - - def _reorder_idx_trace(self, reorder_map): - # reorder list - new_idx_trace_list = [None for _ in range(len(self.index_tracer.idx_trace_list))] - for old_idx, new_idx in reorder_map.items(): - new_idx_trace_list[new_idx] = self.index_tracer.idx_trace_list[old_idx] - self.index_tracer.idx_trace_list = new_idx_trace_list - # update compute - for idx_trace in self.index_tracer.idx_trace_list: - compute = idx_trace["compute"] - for dim_compute in compute: - for idx, i in enumerate(dim_compute): - dim_compute[idx] = reorder_map[i] - # update source - for idx_trace in self.index_tracer.idx_trace_list: - source = idx_trace["source"] - for dim_idx, dim_source in 
enumerate(source): - new_dim_source = {} - for k, v in dim_source.items(): - new_dim_source[reorder_map[k]] = v - source[dim_idx] = new_dim_source - - def reorder_all(self, chunk_info): - if chunk_info is None: - return chunk_info - if len(chunk_info["args"]["prepose_nodes"]) == 0: - return chunk_info - reorder_map = self._get_reorder_map(chunk_info) - self._update_all_reorder_map(reorder_map) - self._reorder_idx_trace(reorder_map) - self._reorder_self_node_list(reorder_map) - chunk_info = self._reorder_chunk_info(chunk_info, reorder_map) - return chunk_info - - def reorder_node_list(self, node_list): - new_node_list = [None for _ in range(len(node_list))] - for old_idx, new_idx in self.all_reorder_map.items(): - new_node_list[new_idx] = node_list[old_idx] - return new_node_list - - def tmp_reorder(self, node_list, chunk_info): - if len(chunk_info["args"]["prepose_nodes"]) == 0: - return node_list, chunk_info - reorder_map = self._get_reorder_map(chunk_info) - - # new tmp node list - new_node_list = [None for _ in range(len(node_list))] - for old_idx, new_idx in reorder_map.items(): - new_node_list[new_idx] = node_list[old_idx] - - chunk_info = self._reorder_chunk_info(chunk_info, reorder_map) - return new_node_list, chunk_info From a6cdbf9161afc526d3a961708c0b202ca18c3e7e Mon Sep 17 00:00:00 2001 From: oahzxl Date: Fri, 6 Jan 2023 17:24:23 +0800 Subject: [PATCH 097/209] seperate trace flow --- colossalai/autochunk/autochunk_codegen.py | 2 +- colossalai/autochunk/search_chunk.py | 53 +-- colossalai/autochunk/select_chunk.py | 3 +- colossalai/autochunk/trace_flow.py | 414 ++++++++++++++++++++ colossalai/autochunk/trace_index.py | 395 ------------------- tests/test_autochunk/benchmark_autochunk.py | 4 +- 6 files changed, 447 insertions(+), 424 deletions(-) create mode 100644 colossalai/autochunk/trace_flow.py diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py index 3bb2e83be242..39728cb794f7 100644 --- 
a/colossalai/autochunk/autochunk_codegen.py +++ b/colossalai/autochunk/autochunk_codegen.py @@ -167,7 +167,7 @@ def emit_code_with_chunk( ) # ones like if "ones_like" in node.name: - meta_node = chunk_region_search.index_tracer.node_list[node_idx] + meta_node = chunk_region_search.trace_index.node_list[node_idx] chunk_dim = chunk_infos[region_idx]["node_chunk_dim"][meta_node][ "chunk_dim" ] diff --git a/colossalai/autochunk/search_chunk.py b/colossalai/autochunk/search_chunk.py index 5c58bda0c393..030b13bdb9c4 100644 --- a/colossalai/autochunk/search_chunk.py +++ b/colossalai/autochunk/search_chunk.py @@ -1,8 +1,10 @@ import copy from .select_chunk import SelectChunk -from .trace_index import TraceIndex, ReorderGraph +from .trace_index import TraceIndex +from .reorder_graph import ReorderGraph from .estiamte_memory import EstimateMemory +from .trace_flow import TraceFlow from .utils import ( get_node_shape, is_non_compute_node, @@ -14,12 +16,13 @@ class SearchChunk(object): def __init__(self, gm, max_memory=None, print_mem=False) -> None: self.gm = gm self.print_mem = print_mem - self.index_tracer = TraceIndex(list(gm.graph.nodes)) - self.index_tracer.trace_index() - self.reorder_graph = ReorderGraph(self.index_tracer) - self.memory_estimator = EstimateMemory() - self.chunk_selector = SelectChunk( - self.index_tracer, self.memory_estimator, self.reorder_graph, max_memory=max_memory + self.trace_index = TraceIndex(list(gm.graph.nodes)) + self.trace_index.trace_index() + self.trace_flow = TraceFlow(self.trace_index) + self.reorder_graph = ReorderGraph(self.trace_index) + self.estimate_memory = EstimateMemory() + self.select_chunk = SelectChunk( + self.trace_index, self.estimate_memory, self.reorder_graph, max_memory=max_memory ) def _find_peak_node(self, mem_peak): @@ -29,7 +32,7 @@ def _find_peak_node(self, mem_peak): def _get_free_var(self): free_var_idx = [] - for idx, n in enumerate(self.index_tracer.node_list): + for idx, n in 
enumerate(self.trace_index.node_list): if n.op == "placeholder": free_var_idx.append(idx) return free_var_idx @@ -99,7 +102,7 @@ def _is_not_compute(self, trace, chunk_range, dim_idx): def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx): start_traces = input_trace[start_idx] end_trace = output_trace[end_idx] - end_node = self.index_tracer.node_list[end_idx] + end_node = self.trace_index.node_list[end_idx] chunk_infos = [] for end_dim, _ in enumerate(end_trace["idx"]): if len(start_traces) > 1: @@ -113,46 +116,46 @@ def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx): ): continue # check index source align - if not self.index_tracer.check_index_source( + if not self.trace_flow.check_index_source( start_dim, start_node, start_idx, end_dim, end_node ): continue # check index copmute - if not self.index_tracer.check_index_compute( + if not self.trace_flow.check_index_compute( start_idx, end_dim, end_node, end_idx ): continue # flow search - chunk_info = self.index_tracer.flow_search( + chunk_info = self.trace_flow.flow_search( start_idx, start_dim, end_idx, end_dim ) if chunk_info is None: continue # check index copmute - if not self.index_tracer.check_index_duplicate(chunk_info): + if not self.trace_flow.check_index_duplicate(chunk_info): continue chunk_infos.append(chunk_info) return chunk_infos def _search_possible_chunk_regions(self, max_chunk_region, peak_node): possible_chunk_region = [] - output_trace = copy.deepcopy(self.index_tracer.idx_trace_list) + output_trace = copy.deepcopy(self.trace_index.idx_trace_list) input_trace = [] # trace of a node's input nodes - for _, n in enumerate(self.index_tracer.node_list): + for _, n in enumerate(self.trace_index.node_list): cur_trace = {} for arg in n.args: if type(arg) == type(n) and not is_non_compute_node_except_placeholder( arg ): - cur_trace[arg] = self.index_tracer._find_trace_from_node(arg) + cur_trace[arg] = self.trace_index._find_trace_from_node(arg) 
input_trace.append(cur_trace) for start_idx in range(max_chunk_region[0], peak_node + 1): for end_idx in range(peak_node, max_chunk_region[1] + 1): # skip non compute nodes if is_non_compute_node( - self.index_tracer.node_list[start_idx] - ) or is_non_compute_node(self.index_tracer.node_list[end_idx]): + self.trace_index.node_list[start_idx] + ) or is_non_compute_node(self.trace_index.node_list[end_idx]): continue # select free dim @@ -173,7 +176,7 @@ def _step_search(self, mem_peak, active_node, chunk_regions): possible_chunk_regions = self._search_possible_chunk_regions( max_chunk_region, peak_node ) - best_chunk_region = self.chunk_selector._select_best_chunk_region( + best_chunk_region = self.select_chunk._select_best_chunk_region( possible_chunk_regions, chunk_regions, peak_node, max_chunk_region, mem_peak ) best_chunk_region = self.reorder_graph.reorder_all(best_chunk_region) @@ -191,8 +194,8 @@ def search_region(self): init_mem_peak, _, active_node, - ) = self.memory_estimator.estimate_chunk_inference_mem( - self.index_tracer.node_list + ) = self.estimate_memory.estimate_chunk_inference_mem( + self.trace_index.node_list ) mem_peak = init_mem_peak @@ -206,14 +209,14 @@ def search_region(self): mem_peak, _, active_node, - ) = self.memory_estimator.estimate_chunk_inference_mem( - self.index_tracer.node_list, chunk_infos + ) = self.estimate_memory.estimate_chunk_inference_mem( + self.trace_index.node_list, chunk_infos ) if self._stop_search(init_mem_peak, mem_peak): break if self.print_mem: self.print_mem = False - self.memory_estimator.estimate_chunk_inference_mem( - self.index_tracer.node_list, chunk_infos, print_mem=True + self.estimate_memory.estimate_chunk_inference_mem( + self.trace_index.node_list, chunk_infos, print_mem=True ) return chunk_infos diff --git a/colossalai/autochunk/select_chunk.py b/colossalai/autochunk/select_chunk.py index f0262f1e57eb..30f4226f54ec 100644 --- a/colossalai/autochunk/select_chunk.py +++ 
b/colossalai/autochunk/select_chunk.py @@ -1,4 +1,5 @@ -from .trace_index import TraceIndex, ReorderGraph +from .trace_index import TraceIndex +from .reorder_graph import ReorderGraph from .estiamte_memory import EstimateMemory from .utils import is_non_compute_node diff --git a/colossalai/autochunk/trace_flow.py b/colossalai/autochunk/trace_flow.py new file mode 100644 index 000000000000..f372fa91335f --- /dev/null +++ b/colossalai/autochunk/trace_flow.py @@ -0,0 +1,414 @@ +from .trace_index import TraceIndex +from .utils import ( + find_chunk_all_input_nodes, + find_chunk_compute_input_and_output_nodes, + find_idx_by_name, + get_node_shape, + is_non_compute_node, + is_non_compute_node_except_placeholder, +) + + +class TraceFlow(object): + def __init__(self, trace_index: TraceIndex) -> None: + self.trace_index = trace_index + + def check_index_source(self, start_dim, start_node, start_idx, end_dim, end_node): + """ + Check 2 given index: one index should be source of the other + Args: + start_idx(int): start node chunk dim + start_node(node): start node + end_idx(int): end node chunk dim + end_node(node): end node + + Returns: + bool: True if check pass + """ + start_node_idx = find_idx_by_name(start_node.name, self.trace_index.node_list) + end_node_trace = self.trace_index._find_trace_from_node(end_node) + end_node_trace_source = end_node_trace["source"][end_dim] + sorted_source = sorted( + end_node_trace_source.items(), key=lambda d: d[0], reverse=True + ) + for node_idx, node_dim in sorted_source: + if node_idx == start_node_idx and start_dim in node_dim: + return True + # it means we meet a node outside the loop, and the node is not input node + if node_idx < start_idx: + return False + return False + + def check_index_compute(self, start_idx, end_dim, end_node, end_idx): + """ + Check 2 given index: check they haven't been computed in the source trace. 
+ Args: + start_idx(int): start node chunk dim + start_node(node): start node + end_idx(int): end node chunk dim + end_node(node): end node + + Returns: + bool: True if check pass + """ + end_node_trace = self.trace_index._find_trace_from_node(end_node) + end_node_compute = end_node_trace["compute"][end_dim] + if any(start_idx <= i <= end_idx for i in end_node_compute): + return False + return True + + def get_node_chunk_dim(self, node_from, node_from_dim, node_to): + node_from_source = self.trace_index._find_source_trace_from_node(node_from) + dim_source = node_from_source[node_from_dim] + node_to_idx = find_idx_by_name(node_to.name, self.trace_index.node_list) + for k, v in dim_source.items(): + if k == node_to_idx: + return v + return None + + def _find_inherit_dim(self, input_node, input_dim, node): + input_node_idx = find_idx_by_name(input_node.name, self.trace_index.node_list) + node_trace_source = self.trace_index._find_source_trace_from_node(node) + for node_dim in range(len(get_node_shape(node))): + if ( + input_node_idx in node_trace_source[node_dim] + and input_dim[0] in node_trace_source[node_dim][input_node_idx] + ): + return node_dim + return None + + def check_index_duplicate(self, chunk_infos, return_dim=False): + input_dim_after_node = {} + for input_node_idx, input_node in enumerate(chunk_infos["inputs"]): + for k, v in chunk_infos["inputs_dim"][input_node_idx].items(): + inherit_dim = self._find_inherit_dim(input_node, v, self.trace_index.node_list[k]) + if inherit_dim: + input_dim_after_node[k] = inherit_dim + + for node in self.trace_index.node_list[ + chunk_infos["region"][0] : chunk_infos["region"][1] + 1 + ]: + if is_non_compute_node_except_placeholder(node): + continue + count = 0 + duplicate_dims = [] + node_trace_source = self.trace_index._find_source_trace_from_node(node) + for node_dim in range(len(get_node_shape(node))): + duplicate_dim = [] + duplicate_flag = False + dim_source = node_trace_source[node_dim] + for k, v in 
dim_source.items(): + if chunk_infos["region"][0] <= k <= chunk_infos["region"][1]: + if k in input_dim_after_node and input_dim_after_node[k] in v: + duplicate_flag = True + duplicate_dim.append((k, v)) + duplicate_dims.append(duplicate_dim) + if duplicate_flag: + count += 1 + + if count > 1: + if return_dim: + return False, duplicate_dims + else: + return False + if return_dim: + return True, None + else: + return True + + def _assgin_single_node_flow( + self, + arg_node, + start_idx, + end_idx, + cur_node_dim, + cur_node_compute, + cur_node_source, + cur_node_fix_dim, + all_node_info, + next_node_list, + ): + arg_idx = find_idx_by_name(arg_node.name, self.trace_index.node_list) + # arg in chunk range or be inputs + if not (start_idx <= arg_idx < end_idx): + return True + + # find arg dim + if cur_node_dim is not None: + # dim is computed + if arg_idx in cur_node_compute[cur_node_dim]: + return False + if arg_idx not in cur_node_source[cur_node_dim]: + arg_dim = None + else: + arg_dim = cur_node_source[cur_node_dim][arg_idx][0] + else: + arg_dim = None + + # get fix dim + arg_fix_dim = [] + if cur_node_dim is not None: + for i in cur_node_fix_dim: + fix_dim_source = cur_node_source[i] + if arg_idx in fix_dim_source: + arg_fix_dim.append(fix_dim_source[arg_idx][0]) + + # if already in node_info, arg dim must be same + if arg_node in all_node_info: + if all_node_info[arg_node]["chunk_dim"] != arg_dim: + return False + all_node_info[arg_node]["fix_dim"] = list( + set(all_node_info[arg_node]["fix_dim"] + arg_fix_dim) + ) + # else add it to list + else: + all_node_info[arg_node] = {"chunk_dim": arg_dim, "fix_dim": arg_fix_dim} + + next_node_list.append(arg_node) + return True + + def _get_all_node_info(self, end_dim, start_idx, end_idx): + cur_node_list = [ + self.trace_index.node_list[end_idx] + ] # start from the last node + all_node_info = {cur_node_list[0]: {"chunk_dim": end_dim, "fix_dim": []}} + + while len(cur_node_list) > 0: + next_node_list = [] + + for 
cur_node in cur_node_list: + # get cur node info + cur_node_chunk_dim = all_node_info[cur_node]["chunk_dim"] + cur_node_fix_dim = all_node_info[cur_node]["fix_dim"] + if cur_node_chunk_dim: + cur_node_compute = self.trace_index._find_compute_trace_from_node( + cur_node + ) + cur_node_source = self.trace_index._find_source_trace_from_node( + cur_node + ) + else: + cur_node_compute = cur_node_source = None + + # get all valid args + arg_list = [] + for arg in cur_node.args: + if type(arg) != type(cur_node): + continue + if is_non_compute_node(arg): + continue + arg_list.append(arg) + flow_flag = self._assgin_single_node_flow( + arg, + start_idx, + end_idx, + cur_node_chunk_dim, + cur_node_compute, + cur_node_source, + cur_node_fix_dim, + all_node_info, + next_node_list, + ) + if flow_flag == False: + return None + + if len(arg_list) == 2: + if any(i in cur_node.name for i in ["add", "mul"]): + for arg in arg_list: + if not ( + start_idx + <= find_idx_by_name(arg.name, self.trace_index.node_list) + < end_idx + ): + continue + arg_chunk_dim = all_node_info[arg]["chunk_dim"] + arg_fix_dim = all_node_info[arg]["fix_dim"] + arg_shape = get_node_shape(arg) + # add all dim as fix dim except chunk dim + for i, shape in enumerate(arg_shape): + if shape != 1 and i != cur_node_chunk_dim: + if i == arg_chunk_dim: + return None + if i not in arg_fix_dim: + arg_fix_dim.append(i) + elif "einsum" in cur_node.name: + pass + elif "matmul" in cur_node.name: + pass + else: + raise NotImplementedError() + cur_node_list = next_node_list + return all_node_info + + def _get_input_nodes_dim(self, inputs, start_idx, end_idx, all_node_info): + inputs_dim = [] + remove_inputs = [] + for input_node in inputs: + input_dict = {} + input_node_idx = find_idx_by_name( + input_node.name, self.trace_index.node_list + ) + for user in input_node.users.keys(): + if is_non_compute_node(user): + continue + user_idx = find_idx_by_name(user.name, self.trace_index.node_list) + if start_idx <= user_idx <= 
end_idx: + chunk_dim = all_node_info[user]["chunk_dim"] + if chunk_dim is not None: + user_source = self.trace_index._find_source_trace_from_node(user)[chunk_dim] + if input_node_idx in user_source: + input_dict[user_idx] = user_source[input_node_idx] + else: + return None, None + if len(input_dict) == 0: + remove_inputs.append(input_node) + else: + inputs_dim.append(input_dict) + for i in remove_inputs: + if i in inputs: + inputs.remove(i) + return inputs, inputs_dim + + def _get_prepose_nodes(self, all_node_info, start_idx, end_idx): + # get all possible prepose nodes + maybe_prepose_nodes = [] + for node, node_info in all_node_info.items(): + if node_info["chunk_dim"] is None: + maybe_prepose_nodes.append(node) + maybe_prepose_nodes.sort( + key=lambda x: find_idx_by_name(x.name, self.trace_index.node_list), + reverse=True, + ) # from last node to first node + prepose_nodes = [] + # set every node as root, search its args, if all legal, turn root and args as prepose nodes + while len(maybe_prepose_nodes) > 0: + tmp_cur_prepose_nodes = [maybe_prepose_nodes[0]] + tmp_cur_related_prepose_nodes = [] + prepose_flag = True + + # loop cur node's all arg until out of chunk + while len(tmp_cur_prepose_nodes) > 0: + if prepose_flag == False: + break + tmp_next_prepose_nodes = [] + tmp_cur_related_prepose_nodes.extend(tmp_cur_prepose_nodes) + for cur_prepose_node in tmp_cur_prepose_nodes: + if prepose_flag == False: + break + for cur_prepose_node_arg in cur_prepose_node.args: + if type(cur_prepose_node_arg) != type(cur_prepose_node): + continue + # out of loop + if not ( + start_idx + <= find_idx_by_name( + cur_prepose_node_arg.name, self.trace_index.node_list + ) + < end_idx + ): + continue + # compute op in loop + elif cur_prepose_node_arg in all_node_info: + if all_node_info[cur_prepose_node_arg]["chunk_dim"] is None: + tmp_next_prepose_nodes.append(cur_prepose_node_arg) + else: + prepose_flag = False + break + # non compute op + else: + 
tmp_next_prepose_nodes.append(cur_prepose_node_arg) + tmp_cur_prepose_nodes = tmp_next_prepose_nodes + + if prepose_flag == False: + maybe_prepose_nodes.remove(maybe_prepose_nodes[0]) + continue + else: + for n in tmp_cur_related_prepose_nodes: + if n not in prepose_nodes: + prepose_nodes.append(n) + if n in maybe_prepose_nodes: + maybe_prepose_nodes.remove(n) + # sort by index + prepose_nodes.sort( + key=lambda x: find_idx_by_name(x.name, self.trace_index.node_list) + ) + + return prepose_nodes + + def _get_non_chunk_inputs(self, chunk_info, start_idx, end_idx): + # we need to log input nodes to avoid deleteing them in the loop + chunk_node_list = self.trace_index.node_list[start_idx : end_idx + 1] + # also need to get some prepose node's arg out of non_chunk_inputs + for n in chunk_info["args"]["prepose_nodes"]: + chunk_node_list.remove(n) + non_chunk_inputs = find_chunk_all_input_nodes(chunk_node_list) + for i in non_chunk_inputs: + if i not in chunk_info["inputs"]: + chunk_info["inputs_non_chunk"].append(i) + return chunk_info + + def flow_search(self, start_idx, start_dim, end_idx, end_dim): + inputs, outputs = find_chunk_compute_input_and_output_nodes( + self.trace_index.node_list[start_idx : end_idx + 1] + ) + # only single ouput + if len(outputs) > 1: + return None + + # get every node's chunk dim and fix dim + all_node_info = self._get_all_node_info(end_dim, start_idx, end_idx) + if all_node_info is None: + return None + + # get input nodes' chunk dim + inputs, inputs_dim = self._get_input_nodes_dim( + inputs, start_idx, end_idx, all_node_info + ) + if inputs is None: + return None + + chunk_info = { + "region": (start_idx, end_idx), + "inputs": inputs, + "inputs_non_chunk": [], + "inputs_dim": inputs_dim, + "outputs": outputs, + "outputs_dim": end_dim, + "node_chunk_dim": all_node_info, + "args": {}, + } + + # move useless nodes ahead of loop + chunk_info["args"]["prepose_nodes"] = self._get_prepose_nodes( + all_node_info, start_idx, end_idx + ) + + # 
find non chunk inputs + chunk_info = self._get_non_chunk_inputs(chunk_info, start_idx, end_idx) + + # reassgin reshape size, some size may have changed due to chunk + chunk_info = self._reassgin_reshape_size(chunk_info) + + return chunk_info + + def _reassgin_reshape_size(self, chunk_info): + chunk_region = chunk_info["region"] + reshape_size = {} + chunk_shape = get_node_shape(chunk_info["outputs"][0])[ + chunk_info["outputs_dim"] + ] + for node in self.trace_index.node_list[chunk_region[0] : chunk_region[1] + 1]: + if any(i in node.name for i in ["reshape", "view"]): + reshape_args = node.args[1:] + reshape_log = self.trace_index.idx_view_list[node] + chunk_dim = chunk_info["node_chunk_dim"][node]["chunk_dim"] + reshape_size[node.name] = {} + for reshape_arg_dim, reshape_arg in enumerate(reshape_args): + if reshape_arg_dim in reshape_log["dim_to"]: + continue + if reshape_arg_dim == chunk_dim: + reshape_size[node.name][reshape_arg.name] = ( + "min(chunk_size, %d - chunk_idx)" % chunk_shape + ) + chunk_info["reshape_size"] = reshape_size + return chunk_info diff --git a/colossalai/autochunk/trace_index.py b/colossalai/autochunk/trace_index.py index 3ac0d7f84272..1e8969d8796e 100644 --- a/colossalai/autochunk/trace_index.py +++ b/colossalai/autochunk/trace_index.py @@ -1,12 +1,8 @@ import copy from .utils import ( - find_chunk_all_input_nodes, - find_chunk_compute_input_and_output_nodes, find_idx_by_name, get_node_shape, - is_non_compute_node, - is_non_compute_node_except_placeholder, ) @@ -588,394 +584,3 @@ def trace_index(self): continue else: raise NotImplementedError(node.op, "op not implemented yet!") - # self._merge_equal_idx() - - def check_index_source(self, start_dim, start_node, start_idx, end_dim, end_node): - """ - Check 2 given index: one index should be source of the other - Args: - start_idx(int): start node chunk dim - start_node(node): start node - end_idx(int): end node chunk dim - end_node(node): end node - - Returns: - bool: True if check pass - 
""" - start_node_idx = find_idx_by_name(start_node.name, self.node_list) - end_node_trace = self._find_trace_from_node(end_node) - end_node_trace_source = end_node_trace["source"][end_dim] - sorted_source = sorted( - end_node_trace_source.items(), key=lambda d: d[0], reverse=True - ) - for node_idx, node_dim in sorted_source: - if node_idx == start_node_idx and start_dim in node_dim: - return True - # it means we meet a node outside the loop, and the node is not input node - if node_idx < start_idx: - return False - return False - - def check_index_compute(self, start_idx, end_dim, end_node, end_idx): - """ - Check 2 given index: check they haven't been computed in the source trace. - Args: - start_idx(int): start node chunk dim - start_node(node): start node - end_idx(int): end node chunk dim - end_node(node): end node - - Returns: - bool: True if check pass - """ - end_node_trace = self._find_trace_from_node(end_node) - end_node_compute = end_node_trace["compute"][end_dim] - if any(start_idx <= i <= end_idx for i in end_node_compute): - return False - return True - - def get_node_chunk_dim(self, node_from, node_from_dim, node_to): - node_from_source = self._find_source_trace_from_node(node_from) - dim_source = node_from_source[node_from_dim] - node_to_idx = find_idx_by_name(node_to.name, self.node_list) - for k, v in dim_source.items(): - if k == node_to_idx: - return v - return None - - def _find_inherit_dim(self, input_node, input_dim, node): - input_node_idx = find_idx_by_name(input_node.name, self.node_list) - node_trace_source = self._find_source_trace_from_node(node) - for node_dim in range(len(get_node_shape(node))): - if ( - input_node_idx in node_trace_source[node_dim] - and input_dim[0] in node_trace_source[node_dim][input_node_idx] - ): - return node_dim - return None - - def check_index_duplicate(self, chunk_infos, return_dim=False): - input_dim_after_node = {} - for input_node_idx, input_node in enumerate(chunk_infos["inputs"]): - for k, v in 
chunk_infos["inputs_dim"][input_node_idx].items(): - inherit_dim = self._find_inherit_dim(input_node, v, self.node_list[k]) - if inherit_dim: - input_dim_after_node[k] = inherit_dim - - for node in self.node_list[ - chunk_infos["region"][0] : chunk_infos["region"][1] + 1 - ]: - if is_non_compute_node_except_placeholder(node): - continue - count = 0 - duplicate_dims = [] - node_trace_source = self._find_source_trace_from_node(node) - for node_dim in range(len(get_node_shape(node))): - duplicate_dim = [] - duplicate_flag = False - dim_source = node_trace_source[node_dim] - for k, v in dim_source.items(): - if chunk_infos["region"][0] <= k <= chunk_infos["region"][1]: - if k in input_dim_after_node and input_dim_after_node[k] in v: - duplicate_flag = True - duplicate_dim.append((k, v)) - duplicate_dims.append(duplicate_dim) - if duplicate_flag: - count += 1 - - if count > 1: - if return_dim: - return False, duplicate_dims - else: - return False - if return_dim: - return True, None - else: - return True - - def _assgin_single_node_flow( - self, - arg_node, - start_idx, - end_idx, - cur_node_dim, - cur_node_compute, - cur_node_source, - cur_node_fix_dim, - all_node_info, - next_node_list, - ): - arg_idx = find_idx_by_name(arg_node.name, self.node_list) - # arg in chunk range or be inputs - if not (start_idx <= arg_idx < end_idx): - return True - - # find arg dim - if cur_node_dim is not None: - # dim is computed - if arg_idx in cur_node_compute[cur_node_dim]: - return False - if arg_idx not in cur_node_source[cur_node_dim]: - arg_dim = None - else: - arg_dim = cur_node_source[cur_node_dim][arg_idx][0] - else: - arg_dim = None - - # get fix dim - arg_fix_dim = [] - if cur_node_dim is not None: - for i in cur_node_fix_dim: - fix_dim_source = cur_node_source[i] - if arg_idx in fix_dim_source: - arg_fix_dim.append(fix_dim_source[arg_idx][0]) - - # if already in node_info, arg dim must be same - if arg_node in all_node_info: - if all_node_info[arg_node]["chunk_dim"] != 
arg_dim: - return False - all_node_info[arg_node]["fix_dim"] = list( - set(all_node_info[arg_node]["fix_dim"] + arg_fix_dim) - ) - # else add it to list - else: - all_node_info[arg_node] = {"chunk_dim": arg_dim, "fix_dim": arg_fix_dim} - - next_node_list.append(arg_node) - return True - - def _get_all_node_info(self, end_dim, start_idx, end_idx): - cur_node_list = [self.node_list[end_idx]] # start from the last node - all_node_info = {cur_node_list[0]: {"chunk_dim": end_dim, "fix_dim": []}} - - while len(cur_node_list) > 0: - next_node_list = [] - - for cur_node in cur_node_list: - # get cur node info - cur_node_chunk_dim = all_node_info[cur_node]["chunk_dim"] - cur_node_fix_dim = all_node_info[cur_node]["fix_dim"] - if cur_node_chunk_dim: - cur_node_compute = self._find_compute_trace_from_node(cur_node) - cur_node_source = self._find_source_trace_from_node(cur_node) - else: - cur_node_compute = cur_node_source = None - - # get all valid args - arg_list = [] - for arg in cur_node.args: - if type(arg) != type(cur_node): - continue - if is_non_compute_node(arg): - continue - arg_list.append(arg) - flow_flag = self._assgin_single_node_flow( - arg, - start_idx, - end_idx, - cur_node_chunk_dim, - cur_node_compute, - cur_node_source, - cur_node_fix_dim, - all_node_info, - next_node_list, - ) - if flow_flag == False: - return None - - if len(arg_list) == 2: - if any(i in cur_node.name for i in ["add", "mul"]): - for arg in arg_list: - if not ( - start_idx - <= find_idx_by_name(arg.name, self.node_list) - < end_idx - ): - continue - arg_chunk_dim = all_node_info[arg]["chunk_dim"] - arg_fix_dim = all_node_info[arg]["fix_dim"] - arg_shape = get_node_shape(arg) - # add all dim as fix dim except chunk dim - for i, shape in enumerate(arg_shape): - if shape != 1 and i != cur_node_chunk_dim: - if i == arg_chunk_dim: - return None - if i not in arg_fix_dim: - arg_fix_dim.append(i) - elif "einsum" in cur_node.name: - pass - elif "matmul" in cur_node.name: - pass - else: - raise 
NotImplementedError() - cur_node_list = next_node_list - return all_node_info - - def _get_input_nodes_dim(self, inputs, start_idx, end_idx, all_node_info): - inputs_dim = [] - remove_inputs = [] - for input_node in inputs: - input_dict = {} - input_node_idx = find_idx_by_name(input_node.name, self.node_list) - for user in input_node.users.keys(): - if is_non_compute_node(user): - continue - user_idx = find_idx_by_name(user.name, self.node_list) - if start_idx <= user_idx <= end_idx: - chunk_dim = all_node_info[user]["chunk_dim"] - if chunk_dim is not None: - user_source = self._find_source_trace_from_node(user)[chunk_dim] - if input_node_idx in user_source: - input_dict[user_idx] = user_source[input_node_idx] - else: - return None, None - if len(input_dict) == 0: - remove_inputs.append(input_node) - else: - inputs_dim.append(input_dict) - for i in remove_inputs: - if i in inputs: - inputs.remove(i) - return inputs, inputs_dim - - def _get_prepose_nodes(self, all_node_info, start_idx, end_idx): - # get all possible prepose nodes - maybe_prepose_nodes = [] - for node, node_info in all_node_info.items(): - if node_info["chunk_dim"] is None: - maybe_prepose_nodes.append(node) - maybe_prepose_nodes.sort( - key=lambda x: find_idx_by_name(x.name, self.node_list), - reverse=True, - ) # from last node to first node - prepose_nodes = [] - # set every node as root, search its args, if all legal, turn root and args as prepose nodes - while len(maybe_prepose_nodes) > 0: - tmp_cur_prepose_nodes = [maybe_prepose_nodes[0]] - tmp_cur_related_prepose_nodes = [] - prepose_flag = True - - # loop cur node's all arg until out of chunk - while len(tmp_cur_prepose_nodes) > 0: - if prepose_flag == False: - break - tmp_next_prepose_nodes = [] - tmp_cur_related_prepose_nodes.extend(tmp_cur_prepose_nodes) - for cur_prepose_node in tmp_cur_prepose_nodes: - if prepose_flag == False: - break - for cur_prepose_node_arg in cur_prepose_node.args: - if type(cur_prepose_node_arg) != 
type(cur_prepose_node): - continue - # out of loop - if not ( - start_idx - <= find_idx_by_name( - cur_prepose_node_arg.name, self.node_list - ) - < end_idx - ): - continue - # compute op in loop - elif cur_prepose_node_arg in all_node_info: - if all_node_info[cur_prepose_node_arg]["chunk_dim"] is None: - tmp_next_prepose_nodes.append(cur_prepose_node_arg) - else: - prepose_flag = False - break - # non compute op - else: - tmp_next_prepose_nodes.append(cur_prepose_node_arg) - tmp_cur_prepose_nodes = tmp_next_prepose_nodes - - if prepose_flag == False: - maybe_prepose_nodes.remove(maybe_prepose_nodes[0]) - continue - else: - for n in tmp_cur_related_prepose_nodes: - if n not in prepose_nodes: - prepose_nodes.append(n) - if n in maybe_prepose_nodes: - maybe_prepose_nodes.remove(n) - # sort by index - prepose_nodes.sort(key=lambda x: find_idx_by_name(x.name, self.node_list)) - - return prepose_nodes - - def _get_non_chunk_inputs(self, chunk_info, start_idx, end_idx): - # we need to log input nodes to avoid deleteing them in the loop - chunk_node_list = self.node_list[start_idx : end_idx + 1] - # also need to get some prepose node's arg out of non_chunk_inputs - for n in chunk_info["args"]["prepose_nodes"]: - chunk_node_list.remove(n) - non_chunk_inputs = find_chunk_all_input_nodes(chunk_node_list) - for i in non_chunk_inputs: - if i not in chunk_info["inputs"]: - chunk_info["inputs_non_chunk"].append(i) - return chunk_info - - def flow_search(self, start_idx, start_dim, end_idx, end_dim): - inputs, outputs = find_chunk_compute_input_and_output_nodes( - self.node_list[start_idx : end_idx + 1] - ) - # only single ouput - if len(outputs) > 1: - return None - - # get every node's chunk dim and fix dim - all_node_info = self._get_all_node_info(end_dim, start_idx, end_idx) - if all_node_info is None: - return None - - # get input nodes' chunk dim - inputs, inputs_dim = self._get_input_nodes_dim( - inputs, start_idx, end_idx, all_node_info - ) - if inputs is None: - return 
None - - chunk_info = { - "region": (start_idx, end_idx), - "inputs": inputs, - "inputs_non_chunk": [], - "inputs_dim": inputs_dim, - "outputs": outputs, - "outputs_dim": end_dim, - "node_chunk_dim": all_node_info, - "args": {}, - } - - # move useless nodes ahead of loop - chunk_info["args"]["prepose_nodes"] = self._get_prepose_nodes( - all_node_info, start_idx, end_idx - ) - - # find non chunk inputs - chunk_info = self._get_non_chunk_inputs(chunk_info, start_idx, end_idx) - - # reassgin reshape size, some size may have changed due to chunk - chunk_info = self._reassgin_reshape_size(chunk_info) - - return chunk_info - - def _reassgin_reshape_size(self, chunk_info): - chunk_region = chunk_info["region"] - reshape_size = {} - chunk_shape = get_node_shape(chunk_info["outputs"][0])[ - chunk_info["outputs_dim"] - ] - for node in self.node_list[chunk_region[0] : chunk_region[1] + 1]: - if any(i in node.name for i in ["reshape", "view"]): - reshape_args = node.args[1:] - reshape_log = self.idx_view_list[node] - chunk_dim = chunk_info["node_chunk_dim"][node]["chunk_dim"] - reshape_size[node.name] = {} - for reshape_arg_dim, reshape_arg in enumerate(reshape_args): - if reshape_arg_dim in reshape_log["dim_to"]: - continue - if reshape_arg_dim == chunk_dim: - reshape_size[node.name][reshape_arg.name] = ( - "min(chunk_size, %d - chunk_idx)" % chunk_shape - ) - chunk_info["reshape_size"] = reshape_size - return chunk_info diff --git a/tests/test_autochunk/benchmark_autochunk.py b/tests/test_autochunk/benchmark_autochunk.py index 081f01368a42..7a9d8cdeee03 100644 --- a/tests/test_autochunk/benchmark_autochunk.py +++ b/tests/test_autochunk/benchmark_autochunk.py @@ -104,8 +104,8 @@ def benchmark_evoformer(): model = evoformer_base().cuda() # build autochunk model - # max_memory = 1000 # MB fit memory mode - max_memory = None # min memory mode + max_memory = 1000 # MB fit memory mode + # max_memory = None # min memory mode autochunk = _build_autochunk(evoformer_base().cuda(), 
max_memory, node, pair) # build openfold From c3a2bf48b447a5e051bcae5d694ff5dd7beda54a Mon Sep 17 00:00:00 2001 From: oahzxl Date: Fri, 6 Jan 2023 17:31:59 +0800 Subject: [PATCH 098/209] code style --- colossalai/autochunk/autochunk_codegen.py | 14 +++++----- colossalai/autochunk/reorder_graph.py | 33 ++++++++++++----------- colossalai/autochunk/search_chunk.py | 11 +++++--- colossalai/autochunk/select_chunk.py | 12 ++++----- colossalai/autochunk/trace_flow.py | 12 ++++++--- 5 files changed, 46 insertions(+), 36 deletions(-) diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py index 39728cb794f7..891753faae6d 100644 --- a/colossalai/autochunk/autochunk_codegen.py +++ b/colossalai/autochunk/autochunk_codegen.py @@ -103,7 +103,7 @@ def emit_code_with_chunk( nodes, emit_node_func, delete_unused_value_func, - chunk_region_search: SearchChunk, + search_chunk: SearchChunk, chunk_infos, ): """Emit code with nested activation checkpoint @@ -133,7 +133,7 @@ def emit_code_with_chunk( chunk_outputs = [i["outputs"][0] for i in chunk_infos] chunk_outputs_dim = [i["outputs_dim"] for i in chunk_infos] - node_list = chunk_region_search.reorder_graph.reorder_node_list(node_list) + node_list = search_chunk.reorder_graph.reorder_node_list(node_list) node_idx = 0 region_idx = 0 within_chunk_region = False @@ -167,7 +167,7 @@ def emit_code_with_chunk( ) # ones like if "ones_like" in node.name: - meta_node = chunk_region_search.trace_index.node_list[node_idx] + meta_node = search_chunk.trace_index.node_list[node_idx] chunk_dim = chunk_infos[region_idx]["node_chunk_dim"][meta_node][ "chunk_dim" ] @@ -220,10 +220,8 @@ def __init__(self, meta_graph, max_memory=None, print_mem=False): self.max_memory = max_memory self.meta_node = list(meta_graph.graph.nodes) # find the chunk regions - self.chunk_region_search = SearchChunk( - meta_graph, max_memory, print_mem - ) - self.chunk_infos = self.chunk_region_search.search_region() + self.search_chunk = 
SearchChunk(meta_graph, max_memory, print_mem) + self.chunk_infos = self.search_chunk.search_region() def _gen_python_code( self, nodes, root_module: str, namespace: _Namespace @@ -458,7 +456,7 @@ def emit_node(node: Node, body): nodes, emit_node, delete_unused_values, - self.chunk_region_search, + self.search_chunk, self.chunk_infos, ) diff --git a/colossalai/autochunk/reorder_graph.py b/colossalai/autochunk/reorder_graph.py index 7b9f4a20d6ab..bf4420eac7ee 100644 --- a/colossalai/autochunk/reorder_graph.py +++ b/colossalai/autochunk/reorder_graph.py @@ -3,28 +3,31 @@ class ReorderGraph(object): - def __init__(self, index_tracer: TraceIndex) -> None: - self.index_tracer = index_tracer - self.all_reorder_map = {i: i for i in range(len(self.index_tracer.idx_trace_list))} + def __init__(self, trace_index: TraceIndex) -> None: + self.trace_index = trace_index + self.all_reorder_map = { + i: i for i in range(len(self.trace_index.idx_trace_list)) + } def _get_reorder_map(self, chunk_info): - reorder_map = {i: i for i in range(len(self.index_tracer.node_list))} + reorder_map = {i: i for i in range(len(self.trace_index.node_list))} chunk_region_start = chunk_info["region"][0] chunk_region_end = chunk_info["region"][1] chunk_prepose_nodes = chunk_info["args"]["prepose_nodes"] chunk_prepose_nodes_idx = [ - find_idx_by_name(i.name, self.index_tracer.node_list) for i in chunk_prepose_nodes + find_idx_by_name(i.name, self.trace_index.node_list) + for i in chunk_prepose_nodes ] # put prepose nodes ahead for idx, n in enumerate(chunk_prepose_nodes): n_idx = chunk_prepose_nodes_idx[idx] reorder_map[n_idx] = chunk_region_start + idx # put other nodes after prepose nodes - for n in self.index_tracer.node_list[chunk_region_start : chunk_region_end + 1]: + for n in self.trace_index.node_list[chunk_region_start : chunk_region_end + 1]: if n in chunk_prepose_nodes: continue - n_idx = find_idx_by_name(n.name, self.index_tracer.node_list) + n_idx = find_idx_by_name(n.name, 
self.trace_index.node_list) pos = sum([n_idx < i for i in chunk_prepose_nodes_idx]) reorder_map[n_idx] = n_idx + pos @@ -50,25 +53,25 @@ def _update_all_reorder_map(self, reorder_map): self.all_reorder_map[origin_idx] = reorder_map[map_idx] def _reorder_self_node_list(self, reorder_map): - new_node_list = [None for _ in range(len(self.index_tracer.node_list))] + new_node_list = [None for _ in range(len(self.trace_index.node_list))] for old_idx, new_idx in reorder_map.items(): - new_node_list[new_idx] = self.index_tracer.node_list[old_idx] - self.index_tracer.node_list = new_node_list + new_node_list[new_idx] = self.trace_index.node_list[old_idx] + self.trace_index.node_list = new_node_list def _reorder_idx_trace(self, reorder_map): # reorder list - new_idx_trace_list = [None for _ in range(len(self.index_tracer.idx_trace_list))] + new_idx_trace_list = [None for _ in range(len(self.trace_index.idx_trace_list))] for old_idx, new_idx in reorder_map.items(): - new_idx_trace_list[new_idx] = self.index_tracer.idx_trace_list[old_idx] - self.index_tracer.idx_trace_list = new_idx_trace_list + new_idx_trace_list[new_idx] = self.trace_index.idx_trace_list[old_idx] + self.trace_index.idx_trace_list = new_idx_trace_list # update compute - for idx_trace in self.index_tracer.idx_trace_list: + for idx_trace in self.trace_index.idx_trace_list: compute = idx_trace["compute"] for dim_compute in compute: for idx, i in enumerate(dim_compute): dim_compute[idx] = reorder_map[i] # update source - for idx_trace in self.index_tracer.idx_trace_list: + for idx_trace in self.trace_index.idx_trace_list: source = idx_trace["source"] for dim_idx, dim_source in enumerate(source): new_dim_source = {} diff --git a/colossalai/autochunk/search_chunk.py b/colossalai/autochunk/search_chunk.py index 030b13bdb9c4..e2c8de74e012 100644 --- a/colossalai/autochunk/search_chunk.py +++ b/colossalai/autochunk/search_chunk.py @@ -1,10 +1,10 @@ import copy -from .select_chunk import SelectChunk -from .trace_index 
import TraceIndex -from .reorder_graph import ReorderGraph from .estiamte_memory import EstimateMemory +from .reorder_graph import ReorderGraph +from .select_chunk import SelectChunk from .trace_flow import TraceFlow +from .trace_index import TraceIndex from .utils import ( get_node_shape, is_non_compute_node, @@ -22,7 +22,10 @@ def __init__(self, gm, max_memory=None, print_mem=False) -> None: self.reorder_graph = ReorderGraph(self.trace_index) self.estimate_memory = EstimateMemory() self.select_chunk = SelectChunk( - self.trace_index, self.estimate_memory, self.reorder_graph, max_memory=max_memory + self.trace_index, + self.estimate_memory, + self.reorder_graph, + max_memory=max_memory, ) def _find_peak_node(self, mem_peak): diff --git a/colossalai/autochunk/select_chunk.py b/colossalai/autochunk/select_chunk.py index 30f4226f54ec..bdc64528ef18 100644 --- a/colossalai/autochunk/select_chunk.py +++ b/colossalai/autochunk/select_chunk.py @@ -1,19 +1,19 @@ -from .trace_index import TraceIndex -from .reorder_graph import ReorderGraph from .estiamte_memory import EstimateMemory +from .reorder_graph import ReorderGraph +from .trace_index import TraceIndex from .utils import is_non_compute_node class SelectChunk(object): def __init__( self, - index_tracer: TraceIndex, - memory_estimator: EstimateMemory, + trace_index: TraceIndex, + estimate_memory: EstimateMemory, reorder_graph: ReorderGraph, max_memory=None, ): - self.index_tracer = index_tracer - self.memory_estimator = memory_estimator + self.index_tracer = trace_index + self.memory_estimator = estimate_memory self.reorder_graph = reorder_graph if max_memory is not None: self.stratge = "fit_memory" diff --git a/colossalai/autochunk/trace_flow.py b/colossalai/autochunk/trace_flow.py index f372fa91335f..7139e7e047ef 100644 --- a/colossalai/autochunk/trace_flow.py +++ b/colossalai/autochunk/trace_flow.py @@ -81,7 +81,9 @@ def check_index_duplicate(self, chunk_infos, return_dim=False): input_dim_after_node = {} for 
input_node_idx, input_node in enumerate(chunk_infos["inputs"]): for k, v in chunk_infos["inputs_dim"][input_node_idx].items(): - inherit_dim = self._find_inherit_dim(input_node, v, self.trace_index.node_list[k]) + inherit_dim = self._find_inherit_dim( + input_node, v, self.trace_index.node_list[k] + ) if inherit_dim: input_dim_after_node[k] = inherit_dim @@ -217,7 +219,9 @@ def _get_all_node_info(self, end_dim, start_idx, end_idx): for arg in arg_list: if not ( start_idx - <= find_idx_by_name(arg.name, self.trace_index.node_list) + <= find_idx_by_name( + arg.name, self.trace_index.node_list + ) < end_idx ): continue @@ -255,7 +259,9 @@ def _get_input_nodes_dim(self, inputs, start_idx, end_idx, all_node_info): if start_idx <= user_idx <= end_idx: chunk_dim = all_node_info[user]["chunk_dim"] if chunk_dim is not None: - user_source = self.trace_index._find_source_trace_from_node(user)[chunk_dim] + user_source = self.trace_index._find_source_trace_from_node( + user + )[chunk_dim] if input_node_idx in user_source: input_dict[user_idx] = user_source[input_node_idx] else: From 8a989a0d89418c308c1d97b4d692a4e753395732 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Fri, 6 Jan 2023 17:55:22 +0800 Subject: [PATCH 099/209] code style --- colossalai/autochunk/autochunk_codegen.py | 69 +++++++++++++---------- 1 file changed, 40 insertions(+), 29 deletions(-) diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py index 891753faae6d..0db2e59080dd 100644 --- a/colossalai/autochunk/autochunk_codegen.py +++ b/colossalai/autochunk/autochunk_codegen.py @@ -98,6 +98,39 @@ def _replace_reshape_size(context, node_name, reshape_size_dict): return context +def _replace_ones_like(search_chunk, chunk_infos, region_idx, node_idx, node, body): + if "ones_like" in node.name: + meta_node = search_chunk.trace_index.node_list[node_idx] + chunk_dim = chunk_infos[region_idx]["node_chunk_dim"][meta_node]["chunk_dim"] + if get_node_shape(meta_node)[chunk_dim] != 
1: + source_node = meta_node.args[0].args[0] + if ( + source_node not in chunk_infos[region_idx]["node_chunk_dim"] + or chunk_infos[region_idx]["node_chunk_dim"][source_node]["chunk_dim"] + is None + ): + chunk_slice = _gen_chunk_slice_dim( + chunk_dim, "chunk_idx", get_node_shape(node) + ) + body[-1] = _replace_name( + body[-1], node.args[0].name, node.args[0].name + chunk_slice + ) + return body + + +def _replace_input_var(chunk_inputs, region_idx, chunk_inputs_dim, node_idx, body): + for input_node_idx, input_node in enumerate(chunk_inputs[region_idx]): + for idx, dim in chunk_inputs_dim[region_idx][input_node_idx].items(): + if idx == node_idx: + chunk_slice = _gen_chunk_slice_dim( + dim[0], "chunk_idx", get_node_shape(input_node) + ) + body[-1] = _replace_name( + body[-1], input_node.name, input_node.name + chunk_slice + ) + return body + + def emit_code_with_chunk( body, nodes, @@ -156,36 +189,14 @@ def emit_code_with_chunk( if within_chunk_region: emit_node_func(node, body) # replace input var with chunk var - for input_node_idx, input_node in enumerate(chunk_inputs[region_idx]): - for idx, dim in chunk_inputs_dim[region_idx][input_node_idx].items(): - if idx == node_idx: - chunk_slice = _gen_chunk_slice_dim( - dim[0], "chunk_idx", get_node_shape(input_node) - ) - body[-1] = _replace_name( - body[-1], input_node.name, input_node.name + chunk_slice - ) + body = _replace_input_var( + chunk_inputs, region_idx, chunk_inputs_dim, node_idx, body + ) # ones like - if "ones_like" in node.name: - meta_node = search_chunk.trace_index.node_list[node_idx] - chunk_dim = chunk_infos[region_idx]["node_chunk_dim"][meta_node][ - "chunk_dim" - ] - if get_node_shape(meta_node)[chunk_dim] != 1: - source_node = meta_node.args[0].args[0] - if ( - source_node not in chunk_infos[region_idx]["node_chunk_dim"] - or chunk_infos[region_idx]["node_chunk_dim"][source_node][ - "chunk_dim" - ] - is None - ): - chunk_slice = _gen_chunk_slice_dim( - chunk_dim, "chunk_idx", 
get_node_shape(node) - ) - body[-1] = _replace_name( - body[-1], node.args[0].name, node.args[0].name + chunk_slice - ) + body = _replace_ones_like( + search_chunk, chunk_infos, region_idx, node_idx, node, body + ) + # reassgin reshape size body[-1] = _replace_reshape_size( body[-1], node.name, chunk_infos[region_idx]["reshape_size"] ) From 4d223e18a2600ca2467fb21ef4c18f0e9aa0d04c Mon Sep 17 00:00:00 2001 From: oahzxl Date: Mon, 9 Jan 2023 13:46:17 +0800 Subject: [PATCH 100/209] fix typo --- colossalai/autochunk/{estiamte_memory.py => estimate_memory.py} | 0 colossalai/autochunk/search_chunk.py | 2 +- colossalai/autochunk/select_chunk.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename colossalai/autochunk/{estiamte_memory.py => estimate_memory.py} (100%) diff --git a/colossalai/autochunk/estiamte_memory.py b/colossalai/autochunk/estimate_memory.py similarity index 100% rename from colossalai/autochunk/estiamte_memory.py rename to colossalai/autochunk/estimate_memory.py diff --git a/colossalai/autochunk/search_chunk.py b/colossalai/autochunk/search_chunk.py index e2c8de74e012..21b967497f1b 100644 --- a/colossalai/autochunk/search_chunk.py +++ b/colossalai/autochunk/search_chunk.py @@ -1,6 +1,6 @@ import copy -from .estiamte_memory import EstimateMemory +from .estimate_memory import EstimateMemory from .reorder_graph import ReorderGraph from .select_chunk import SelectChunk from .trace_flow import TraceFlow diff --git a/colossalai/autochunk/select_chunk.py b/colossalai/autochunk/select_chunk.py index bdc64528ef18..7127cfd64e69 100644 --- a/colossalai/autochunk/select_chunk.py +++ b/colossalai/autochunk/select_chunk.py @@ -1,4 +1,4 @@ -from .estiamte_memory import EstimateMemory +from .estimate_memory import EstimateMemory from .reorder_graph import ReorderGraph from .trace_index import TraceIndex from .utils import is_non_compute_node From cb68ee864a21e330e8061ee13811a7045f3d65f3 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Mon, 9 Jan 2023 14:20:41 
+0800 Subject: [PATCH 101/209] set benchmark --- tests/test_autochunk/benchmark_autochunk.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_autochunk/benchmark_autochunk.py b/tests/test_autochunk/benchmark_autochunk.py index 7a9d8cdeee03..6632ece61376 100644 --- a/tests/test_autochunk/benchmark_autochunk.py +++ b/tests/test_autochunk/benchmark_autochunk.py @@ -98,14 +98,14 @@ def _build_openfold(): def benchmark_evoformer(): # init data and model msa_len = 256 - pair_len = 256 + pair_len = 512 node = torch.randn(1, msa_len, pair_len, 256).cuda() pair = torch.randn(1, pair_len, pair_len, 128).cuda() model = evoformer_base().cuda() # build autochunk model - max_memory = 1000 # MB fit memory mode - # max_memory = None # min memory mode + # max_memory = 1000 # MB, fit memory mode + max_memory = None # min memory mode autochunk = _build_autochunk(evoformer_base().cuda(), max_memory, node, pair) # build openfold From 18a51c87fe0aa3a1210d7484fc09c16714e04bb7 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Mon, 9 Jan 2023 14:20:54 +0800 Subject: [PATCH 102/209] rename test --- .../{test_autochunk.py => test_autochunk_codegen.py} | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) rename tests/test_autochunk/{test_autochunk.py => test_autochunk_codegen.py} (97%) diff --git a/tests/test_autochunk/test_autochunk.py b/tests/test_autochunk/test_autochunk_codegen.py similarity index 97% rename from tests/test_autochunk/test_autochunk.py rename to tests/test_autochunk/test_autochunk_codegen.py index 85a162084cc9..1c5dd939d710 100644 --- a/tests/test_autochunk/test_autochunk.py +++ b/tests/test_autochunk/test_autochunk_codegen.py @@ -18,9 +18,7 @@ def _test_fwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair): torch.cuda.reset_peak_memory_stats() now_mem = torch.cuda.memory_allocated() / 1024**2 with torch.no_grad(): - node1 = node.clone() - pair1 = pair.clone() - gm(node1, pair1) + gm(node.clone(), pair.clone()) new_now_mem = 
torch.cuda.memory_allocated() / 1024**2 new_max_mem = torch.cuda.max_memory_allocated() / 1024**2 print( From 74b81395a2edbce36896f3d184c6cfae327024b5 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Mon, 9 Jan 2023 14:26:22 +0800 Subject: [PATCH 103/209] update codegen test --- .../test_autochunk/test_autochunk_codegen.py | 42 +++++++++++-------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/tests/test_autochunk/test_autochunk_codegen.py b/tests/test_autochunk/test_autochunk_codegen.py index 1c5dd939d710..8246275eb08a 100644 --- a/tests/test_autochunk/test_autochunk_codegen.py +++ b/tests/test_autochunk/test_autochunk_codegen.py @@ -15,16 +15,19 @@ def _test_fwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair): - torch.cuda.reset_peak_memory_stats() - now_mem = torch.cuda.memory_allocated() / 1024**2 - with torch.no_grad(): - gm(node.clone(), pair.clone()) - new_now_mem = torch.cuda.memory_allocated() / 1024**2 - new_max_mem = torch.cuda.max_memory_allocated() / 1024**2 - print( - "autochunk now mem:%.2f max mem:%.2f" - % (new_now_mem - now_mem, new_max_mem - now_mem) - ) + # for memory test + # torch.cuda.reset_peak_memory_stats() + # now_mem = torch.cuda.memory_allocated() / 1024**2 + # with torch.no_grad(): + # node1 = node.clone() + # pair1 = pair.clone() + # gm(node1, pair1) + # new_now_mem = torch.cuda.memory_allocated() / 1024**2 + # new_max_mem = torch.cuda.max_memory_allocated() / 1024**2 + # print( + # "autochunk now mem:%.2f max mem:%.2f" + # % (new_now_mem - now_mem, new_max_mem - now_mem) + # ) # test forward with torch.no_grad(): @@ -43,7 +46,7 @@ def _test_fwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair): ) -def _run_offload_codegen(rank): +def _test_autochunk_codegen(rank): # launch colossalai to make sure we could execute colossalai.utils.checkpoint currectly colossalai.launch( config={}, @@ -56,8 +59,10 @@ def _run_offload_codegen(rank): # build model and input model = evoformer_base().cuda() - node = 
torch.randn(1, 100, 300, 256).cuda() - pair = torch.randn(1, 300, 300, 128).cuda() + msa_len = 32 + pair_len = 64 + node = torch.randn(1, msa_len, pair_len, 256).cuda() + pair = torch.randn(1, pair_len, pair_len, 128).cuda() # trace the module and replace codegen graph = ColoTracer().trace( @@ -85,17 +90,18 @@ def _run_offload_codegen(rank): gm = ColoGraphModule(model, graph) gm.recompile() - # assert we have all the components - # code = graph.python_code("self").src + # assert we have inserted chunk + code = graph.python_code("self").src + assert "chunk_size" in code # print(code) _test_fwd(model, gm, node, pair) gpc.destroy() -def test_autochunk(): - mp.spawn(_run_offload_codegen, nprocs=1) +def test_autochunk_codegen(): + mp.spawn(_test_autochunk_codegen, nprocs=1) if __name__ == "__main__": - _run_offload_codegen(0) + _test_autochunk_codegen(0) From 9880fd2cd8b3b24c28333926338656a06dd170f3 Mon Sep 17 00:00:00 2001 From: eric8607242 Date: Mon, 9 Jan 2023 14:35:14 +0800 Subject: [PATCH 104/209] Fix state_dict key missing issue of the ZeroDDP (#2363) * Fix state_dict output for ZeroDDP duplicated parameters * Rewrite state_dict based on get_static_torch_model * Modify get_static_torch_model to be compatible with the lower version (ZeroDDP) --- colossalai/nn/parallel/data_parallel.py | 37 +++++++++++++++++++++---- colossalai/nn/parallel/utils.py | 16 +++++------ 2 files changed, 39 insertions(+), 14 deletions(-) diff --git a/colossalai/nn/parallel/data_parallel.py b/colossalai/nn/parallel/data_parallel.py index e3bb83347d21..8fd08db957b7 100644 --- a/colossalai/nn/parallel/data_parallel.py +++ b/colossalai/nn/parallel/data_parallel.py @@ -18,6 +18,7 @@ from colossalai.zero.utils.gemini_hook import GeminiZeROHook from .reducer import Reducer +from .utils import get_static_torch_model try: from torch.nn.modules.module import _EXTRA_STATE_KEY_SUFFIX, _IncompatibleKeys @@ -251,6 +252,7 @@ def __init__(self, pin_memory=pin_memory) self.fp32_params.append(fp32_p) 
self.grads_device[p] = self.gemini_manager.default_device + self.chunk_manager.close_all_groups() self._cast_buffers() @@ -331,12 +333,11 @@ def set_chunk_grad_device(self, chunk: Chunk, device: torch.device) -> None: for tensor in chunk.get_tensors(): self.grads_device[tensor] = device - def state_dict(self, destination=None, prefix='', keep_vars=False, only_rank_0: bool = True): - r"""Returns a dictionary containing a whole state of the module. - - Both parameters and persistent buffers (e.g. running averages) are - included. Keys are corresponding parameter and buffer names. - Parameters and buffers set to ``None`` are not included. + def state_dict(self, destination=None, prefix='', keep_vars=False, only_rank_0: bool = True, strict: bool = True): + r""" + Args: + strict (bool): whether to reture the whole model state + as the original pytorch state_dict() Returns: dict: @@ -346,7 +347,31 @@ def state_dict(self, destination=None, prefix='', keep_vars=False, only_rank_0: >>> module.state_dict().keys() ['bias', 'weight'] + """ + if strict: + return get_static_torch_model(zero_ddp_model=self, device=get_current_device(), + only_rank_0=only_rank_0).state_dict(destination=destination, + prefix=prefix, + keep_vars=keep_vars) + return self._non_strict_state_dict(destination=destination, + prefix=prefix, + keep_vars=keep_vars, + only_rank_0=only_rank_0) + + def _non_strict_state_dict(self, destination=None, prefix='', keep_vars=False, only_rank_0: bool = True): + r"""Returns a dictionary containing a whole state of the module. + + Both parameters and persistent buffers (e.g. running averages) are + included. Keys are corresponding parameter and buffer names. + Parameters and buffers set to ``None`` are not included. + Warning: The non strict state dict would ignore the parameters if the + tensors of the parameters are shared with other parameters which + have been included in the dictionary. 
+ + Returns: + dict: + a dictionary containing a whole state of the module """ if destination is None: destination = OrderedDict() diff --git a/colossalai/nn/parallel/utils.py b/colossalai/nn/parallel/utils.py index 1205cbc3a658..988f978254a1 100644 --- a/colossalai/nn/parallel/utils.py +++ b/colossalai/nn/parallel/utils.py @@ -60,17 +60,17 @@ def _get_shallow_copy_model(model: nn.Module): return name_to_module[''] -def get_static_torch_model(gemini_ddp_model, +def get_static_torch_model(zero_ddp_model, device=torch.device("cpu"), dtype=torch.float32, only_rank_0=True) -> torch.nn.Module: - """Get a static torch.nn.Module model from the given GeminiDDP module. - You should notice that the original GeminiDDP model is not modified. + """Get a static torch.nn.Module model from the given ZeroDDP module. + You should notice that the original ZeroDDP model is not modified. Thus, you can use the original model in further training. But you should not use the returned torch model to train, this can cause unexpected errors. 
Args: - gemini_ddp_model (GeminiDDP): a gemini ddp model + zero_ddp_model (ZeroDDP): a zero ddp model device (torch.device): the device of the final torch model dtype (torch.dtype): the dtype of the final torch model only_rank_0 (bool): if True, only rank0 has the coverted torch model @@ -78,11 +78,11 @@ def get_static_torch_model(gemini_ddp_model, Returns: torch.nn.Module: a static torch model used for saving checkpoints or numeric checks """ - from colossalai.nn.parallel import GeminiDDP - assert isinstance(gemini_ddp_model, GeminiDDP) + from colossalai.nn.parallel import ZeroDDP + assert isinstance(zero_ddp_model, ZeroDDP) - state_dict = gemini_ddp_model.state_dict(only_rank_0=only_rank_0) - colo_model = gemini_ddp_model.module + state_dict = zero_ddp_model.state_dict(only_rank_0=only_rank_0, strict=False) + colo_model = zero_ddp_model.module torch_model = _get_shallow_copy_model(colo_model) if not only_rank_0 or dist.get_rank() == 0: From 3abbaf8bc68c8a3366241a3dc2e97f6944605fb2 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Mon, 9 Jan 2023 14:53:04 +0800 Subject: [PATCH 105/209] update codegen test --- .../test_autochunk/test_autochunk_codegen.py | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/tests/test_autochunk/test_autochunk_codegen.py b/tests/test_autochunk/test_autochunk_codegen.py index 8246275eb08a..c91148e11ff8 100644 --- a/tests/test_autochunk/test_autochunk_codegen.py +++ b/tests/test_autochunk/test_autochunk_codegen.py @@ -1,3 +1,5 @@ +from functools import partial + import pytest import torch import torch.fx @@ -46,7 +48,7 @@ def _test_fwd(model: torch.nn.Module, gm: ColoGraphModule, node, pair): ) -def _test_autochunk_codegen(rank): +def _test_autochunk_codegen(rank, msa_len, pair_len, max_memory): # launch colossalai to make sure we could execute colossalai.utils.checkpoint currectly colossalai.launch( config={}, @@ -59,8 +61,6 @@ def _test_autochunk_codegen(rank): # build model and input model = 
evoformer_base().cuda() - msa_len = 32 - pair_len = 64 node = torch.randn(1, msa_len, pair_len, 256).cuda() pair = torch.randn(1, pair_len, pair_len, 128).cuda() @@ -85,7 +85,7 @@ def _test_autochunk_codegen(rank): MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, fake_device="cuda:0") ) - codegen = AutoChunkCodeGen(gm_prop) + codegen = AutoChunkCodeGen(gm_prop, max_memory=max_memory) graph.set_codegen(codegen) gm = ColoGraphModule(model, graph) gm.recompile() @@ -99,9 +99,18 @@ def _test_autochunk_codegen(rank): gpc.destroy() -def test_autochunk_codegen(): - mp.spawn(_test_autochunk_codegen, nprocs=1) +@pytest.mark.parametrize("max_memory", [None, 20, 24, 28, 32]) +@pytest.mark.parametrize("msa_len", [32]) +@pytest.mark.parametrize("pair_len", [64]) +def test_autochunk_codegen(msa_len, pair_len, max_memory): + run_func = partial( + _test_autochunk_codegen, + msa_len=msa_len, + pair_len=pair_len, + max_memory=max_memory, + ) + mp.spawn(run_func, nprocs=1) if __name__ == "__main__": - _test_autochunk_codegen(0) + _test_autochunk_codegen(0, 32, 64, None) From a005965d2d5f506aafe672575388501bfc5dc5d8 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Mon, 9 Jan 2023 14:57:47 +0800 Subject: [PATCH 106/209] update codegen test --- tests/test_autochunk/test_autochunk_codegen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_autochunk/test_autochunk_codegen.py b/tests/test_autochunk/test_autochunk_codegen.py index c91148e11ff8..62763a6d5e2a 100644 --- a/tests/test_autochunk/test_autochunk_codegen.py +++ b/tests/test_autochunk/test_autochunk_codegen.py @@ -99,7 +99,7 @@ def _test_autochunk_codegen(rank, msa_len, pair_len, max_memory): gpc.destroy() -@pytest.mark.parametrize("max_memory", [None, 20, 24, 28, 32]) +@pytest.mark.parametrize("max_memory", [None, 20, 25, 30]) @pytest.mark.parametrize("msa_len", [32]) @pytest.mark.parametrize("pair_len", [64]) def test_autochunk_codegen(msa_len, pair_len, max_memory): From 
d106b271f8fa8968bfa7a5f7652448c41f26c260 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Mon, 9 Jan 2023 15:19:08 +0800 Subject: [PATCH 107/209] add chunk search test --- tests/test_autochunk/test_autochunk_search.py | 86 +++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 tests/test_autochunk/test_autochunk_search.py diff --git a/tests/test_autochunk/test_autochunk_search.py b/tests/test_autochunk/test_autochunk_search.py new file mode 100644 index 000000000000..c824a43ab612 --- /dev/null +++ b/tests/test_autochunk/test_autochunk_search.py @@ -0,0 +1,86 @@ +from functools import partial + +import pytest +import torch +import torch.fx +import torch.multiprocessing as mp + +import colossalai +from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen +from colossalai.core import global_context as gpc +from colossalai.fx import ColoTracer +from colossalai.fx.graph_module import ColoGraphModule +from colossalai.fx.passes.meta_info_prop import MetaInfoProp +from colossalai.fx.profiler import MetaTensor +from colossalai.utils import free_port +from tests.test_autochunk.evoformer.evoformer import evoformer_base + + +def assert_chunk_infos(chunk_infos, max_memory, msa_len, pair_len): + found_regions = [i["region"] for i in chunk_infos] + + if msa_len == 32 and pair_len == 64: + if max_memory is None: + target_regions = [(142, 154), (366, 373), (233, 283), (301, 351), (127, 134), (204, 228), (167, 191), (161, 166), (198, 203), (6, 69)] + elif max_memory == 20: + target_regions = [(142, 154), (369, 373), (233, 269), (301, 351)] + elif max_memory == 25: + target_regions = [(144, 154), (369, 370)] + elif max_memory == 30: + target_regions = [(144, 154)] + else: + raise NotImplementedError() + else: + raise NotImplementedError() + + assert len(found_regions) == len(target_regions), "len of found regions %s doesn't equal len of target regions %s" % (str(found_regions), str(target_regions)) + for region in target_regions: + assert region in 
found_regions, "region:%s not in found regions for msa:%d, pair:%d, maxmem:%d" % (str(region), msa_len, pair_len, max_memory) + for region in found_regions: + assert region in target_regions, "region:%s should not be found for msa:%d, pair:%d, maxmem:%d" % (str(region), msa_len, pair_len, max_memory) + + +def _test_autochunk_search(rank, msa_len, pair_len, max_memory): + # launch colossalai to make sure we could execute colossalai.utils.checkpoint currectly + colossalai.launch( + config={}, + rank=rank, + world_size=1, + host="localhost", + port=free_port(), + backend="nccl", + ) + + # build model and input + model = evoformer_base().cuda() + node = torch.randn(1, msa_len, pair_len, 256).cuda() + pair = torch.randn(1, pair_len, pair_len, 128).cuda() + + gm_prop = torch.fx.symbolic_trace(model) # must use symbolic_trace + interp = MetaInfoProp(gm_prop) + interp.propagate( + MetaTensor(node, fake_device="cuda:0"), MetaTensor(pair, fake_device="cuda:0") + ) + + codegen = AutoChunkCodeGen(gm_prop, max_memory=max_memory) + chunk_infos = codegen.chunk_infos + assert_chunk_infos(chunk_infos, max_memory, msa_len, pair_len) + + gpc.destroy() + + +@pytest.mark.parametrize("max_memory", [None, 20, 25, 30]) +@pytest.mark.parametrize("msa_len", [32]) +@pytest.mark.parametrize("pair_len", [64]) +def test_autochunk_search(msa_len, pair_len, max_memory): + run_func = partial( + _test_autochunk_search, + msa_len=msa_len, + pair_len=pair_len, + max_memory=max_memory, + ) + mp.spawn(run_func, nprocs=1) + + +if __name__ == "__main__": + _test_autochunk_search(0, 32, 64, 20) From d5c4f0bf954a5686777f652e34b5cd18df2a0d5a Mon Sep 17 00:00:00 2001 From: oahzxl Date: Mon, 9 Jan 2023 15:22:09 +0800 Subject: [PATCH 108/209] code style --- tests/test_autochunk/test_autochunk_search.py | 29 +++++++++++++++---- 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/tests/test_autochunk/test_autochunk_search.py b/tests/test_autochunk/test_autochunk_search.py index 
c824a43ab612..6f7214633fa3 100644 --- a/tests/test_autochunk/test_autochunk_search.py +++ b/tests/test_autochunk/test_autochunk_search.py @@ -8,8 +8,6 @@ import colossalai from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen from colossalai.core import global_context as gpc -from colossalai.fx import ColoTracer -from colossalai.fx.graph_module import ColoGraphModule from colossalai.fx.passes.meta_info_prop import MetaInfoProp from colossalai.fx.profiler import MetaTensor from colossalai.utils import free_port @@ -32,12 +30,31 @@ def assert_chunk_infos(chunk_infos, max_memory, msa_len, pair_len): raise NotImplementedError() else: raise NotImplementedError() - - assert len(found_regions) == len(target_regions), "len of found regions %s doesn't equal len of target regions %s" % (str(found_regions), str(target_regions)) + + assert len(found_regions) == len( + target_regions + ), "len of found regions %s doesn't equal len of target regions %s" % ( + str(found_regions), + str(target_regions), + ) for region in target_regions: - assert region in found_regions, "region:%s not in found regions for msa:%d, pair:%d, maxmem:%d" % (str(region), msa_len, pair_len, max_memory) + assert ( + region in found_regions + ), "region:%s not in found regions for msa:%d, pair:%d, maxmem:%d" % ( + str(region), + msa_len, + pair_len, + max_memory, + ) for region in found_regions: - assert region in target_regions, "region:%s should not be found for msa:%d, pair:%d, maxmem:%d" % (str(region), msa_len, pair_len, max_memory) + assert ( + region in target_regions + ), "region:%s should not be found for msa:%d, pair:%d, maxmem:%d" % ( + str(region), + msa_len, + pair_len, + max_memory, + ) def _test_autochunk_search(rank, msa_len, pair_len, max_memory): From aafc3516a5c07347f58bbc1a52410f74e51b685f Mon Sep 17 00:00:00 2001 From: oahzxl Date: Mon, 9 Jan 2023 15:32:19 +0800 Subject: [PATCH 109/209] add available --- tests/test_autochunk/test_autochunk_codegen.py | 2 ++ 
tests/test_autochunk/test_autochunk_search.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/tests/test_autochunk/test_autochunk_codegen.py b/tests/test_autochunk/test_autochunk_codegen.py index 62763a6d5e2a..c4f5cda67204 100644 --- a/tests/test_autochunk/test_autochunk_codegen.py +++ b/tests/test_autochunk/test_autochunk_codegen.py @@ -9,6 +9,7 @@ from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen from colossalai.core import global_context as gpc from colossalai.fx import ColoTracer +from colossalai.fx.codegen.activation_checkpoint_codegen import CODEGEN_AVAILABLE from colossalai.fx.graph_module import ColoGraphModule from colossalai.fx.passes.meta_info_prop import MetaInfoProp from colossalai.fx.profiler import MetaTensor @@ -99,6 +100,7 @@ def _test_autochunk_codegen(rank, msa_len, pair_len, max_memory): gpc.destroy() +@pytest.mark.skipif(not CODEGEN_AVAILABLE, reason='torch version is lower than 1.12.0') @pytest.mark.parametrize("max_memory", [None, 20, 25, 30]) @pytest.mark.parametrize("msa_len", [32]) @pytest.mark.parametrize("pair_len", [64]) diff --git a/tests/test_autochunk/test_autochunk_search.py b/tests/test_autochunk/test_autochunk_search.py index 6f7214633fa3..5026c3ad3b3d 100644 --- a/tests/test_autochunk/test_autochunk_search.py +++ b/tests/test_autochunk/test_autochunk_search.py @@ -8,6 +8,7 @@ import colossalai from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen from colossalai.core import global_context as gpc +from colossalai.fx.codegen.activation_checkpoint_codegen import CODEGEN_AVAILABLE from colossalai.fx.passes.meta_info_prop import MetaInfoProp from colossalai.fx.profiler import MetaTensor from colossalai.utils import free_port @@ -86,6 +87,7 @@ def _test_autochunk_search(rank, msa_len, pair_len, max_memory): gpc.destroy() +@pytest.mark.skipif(not CODEGEN_AVAILABLE, reason="torch version is lower than 1.12.0") @pytest.mark.parametrize("max_memory", [None, 20, 25, 30]) 
@pytest.mark.parametrize("msa_len", [32]) @pytest.mark.parametrize("pair_len", [64]) From 498b5ca993fb17eccdfbe7608f36444d5779f0c8 Mon Sep 17 00:00:00 2001 From: HELSON Date: Mon, 9 Jan 2023 15:52:17 +0800 Subject: [PATCH 110/209] [hotfix] fix gpt gemini example (#2404) * [hotfix] fix gpt gemini example * [example] add new assertions --- .../language/gpt/gemini/benchmark_gemini.sh | 30 ++++++++++--------- .../language/gpt/gemini/train_gpt_demo.py | 2 ++ 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/examples/language/gpt/gemini/benchmark_gemini.sh b/examples/language/gpt/gemini/benchmark_gemini.sh index 13086666eefd..464ea03da7eb 100644 --- a/examples/language/gpt/gemini/benchmark_gemini.sh +++ b/examples/language/gpt/gemini/benchmark_gemini.sh @@ -1,18 +1,20 @@ for MODEL_TYPE in "gpt2_medium"; do - for BATCH_SIZE in 16; do - for GPUNUM in 1 2 4 8; do - for TPDEGREE in 1 2 4 8; do - if [ ${TPDEGREE} -gt ${GPUNUM} ]; then - continue - fi - for PLACEMENT in "cpu" "auto"; do - echo "****************** Begin ***************************" - echo "* benchmrking MODEL_TYPE ${MODEL_TYPE} BS ${BATCH_SIZE} BS ${BS} GPUNUM ${GPUNUM} TPDEGREE ${TPDEGREE} PLACEMENT ${PLACEMENT}" - MODEL_TYPE=${MODEL_TYPE} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} PLACEMENT=${PLACEMENT} \ - bash ./gemini/run_gemini.sh - echo "****************** Finished ***************************" - echo "" - echo "" + for DISPAN in "colossalai"; do + for BATCH_SIZE in 16; do + for GPUNUM in 1 2 4 8; do + for TPDEGREE in 1 2 4 8; do + if [ ${TPDEGREE} -gt ${GPUNUM} ]; then + continue + fi + for PLACEMENT in "cpu" "auto"; do + echo "****************** Begin ***************************" + echo "+ benchmrking MODEL ${MODEL_TYPE} DISPAN ${DISPAN} GPU ${GPUNUM} BS ${BATCH_SIZE} TP ${TPDEGREE} POLICY ${PLACEMENT}" + MODEL_TYPE=${MODEL_TYPE} DISPAN=${DISPAN} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} PLACEMENT=${PLACEMENT} \ + bash ./run_gemini.sh + echo 
"****************** Finished ***************************" + echo "" + echo "" + done done done done diff --git a/examples/language/gpt/gemini/train_gpt_demo.py b/examples/language/gpt/gemini/train_gpt_demo.py index 29f8c8ef1215..891b1de15af1 100644 --- a/examples/language/gpt/gemini/train_gpt_demo.py +++ b/examples/language/gpt/gemini/train_gpt_demo.py @@ -270,6 +270,7 @@ def main(): tp_pg = ProcessGroup(tp_degree=args.tp_degree) # Tensor Parallelism (TP) + # You should notice that v0.1.10 is not compatible with TP degree > 1 tensor_parallelize(model, tp_pg) # build a Gemini model and a highly optimized cpu optimizer @@ -278,6 +279,7 @@ def main(): logger.info(get_mem_info(prefix='After init optim, '), ranks=[0]) else: + assert args.tp_degree == 1, "The degree of TP should be 1 for DDP examples." model = model_builder(args.model_type)(checkpoint=True).cuda() if args.distplan.startswith("torch"): From 19cc64b1d39529bde502f9507d20770430f6e3af Mon Sep 17 00:00:00 2001 From: oahzxl Date: Mon, 9 Jan 2023 16:06:58 +0800 Subject: [PATCH 111/209] remove autochunk_available --- colossalai/autochunk/autochunk_codegen.py | 490 +++++++++++----------- 1 file changed, 239 insertions(+), 251 deletions(-) diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py index 0db2e59080dd..9ec59477b426 100644 --- a/colossalai/autochunk/autochunk_codegen.py +++ b/colossalai/autochunk/autochunk_codegen.py @@ -16,13 +16,9 @@ from torch.fx.node import Argument, Node, _get_qualified_name, _type_repr, map_arg import colossalai - from .search_chunk import SearchChunk from .utils import delete_free_var_from_last_use, find_idx_by_name, get_node_shape -CODEGEN_AVAILABLE = True -__all__ = ["AutoChunkCodeGen"] - def _gen_chunk_slice_dim(chunk_dim, chunk_idx_name, shape): new_shape = "[" @@ -222,287 +218,279 @@ def emit_code_with_chunk( node_idx += 1 -if CODEGEN_AVAILABLE: - - class AutoChunkCodeGen(CodeGen): - def __init__(self, meta_graph, max_memory=None, 
print_mem=False): - super().__init__() - self.meta_graph = meta_graph - self.max_memory = max_memory - self.meta_node = list(meta_graph.graph.nodes) - # find the chunk regions - self.search_chunk = SearchChunk(meta_graph, max_memory, print_mem) - self.chunk_infos = self.search_chunk.search_region() +class AutoChunkCodeGen(CodeGen): + def __init__(self, meta_graph, max_memory=None, print_mem=False): + super().__init__() + self.meta_graph = meta_graph + self.max_memory = max_memory + self.meta_node = list(meta_graph.graph.nodes) + # find the chunk regions + self.search_chunk = SearchChunk(meta_graph, max_memory, print_mem) + self.chunk_infos = self.search_chunk.search_region() - def _gen_python_code( - self, nodes, root_module: str, namespace: _Namespace - ) -> PythonCode: - free_vars: List[str] = [] - body: List[str] = [] - globals_: Dict[str, Any] = {} - wrapped_fns: Dict[str, None] = {} + def _gen_python_code( + self, nodes, root_module: str, namespace: _Namespace + ) -> PythonCode: + free_vars: List[str] = [] + body: List[str] = [] + globals_: Dict[str, Any] = {} + wrapped_fns: Dict[str, None] = {} - # Wrap string in list to pass by reference - maybe_return_annotation: List[str] = [""] + # Wrap string in list to pass by reference + maybe_return_annotation: List[str] = [""] - def add_global(name_hint: str, obj: Any): - """Add an obj to be tracked as a global. + def add_global(name_hint: str, obj: Any): + """Add an obj to be tracked as a global. - We call this for names that reference objects external to the - Graph, like functions or types. + We call this for names that reference objects external to the + Graph, like functions or types. - Returns: the global name that should be used to reference 'obj' in generated source. - """ - if ( - _is_from_torch(obj) and obj != torch.device - ): # to support registering torch.device - # HACK: workaround for how torch custom ops are registered. 
We - # can't import them like normal modules so they must retain their - # fully qualified name. - return _get_qualified_name(obj) - - # normalize the name hint to get a proper identifier - global_name = namespace.create_name(name_hint, obj) - - if global_name in globals_: - assert globals_[global_name] is obj - return global_name - globals_[global_name] = obj + Returns: the global name that should be used to reference 'obj' in generated source. + """ + if ( + _is_from_torch(obj) and obj != torch.device + ): # to support registering torch.device + # HACK: workaround for how torch custom ops are registered. We + # can't import them like normal modules so they must retain their + # fully qualified name. + return _get_qualified_name(obj) + + # normalize the name hint to get a proper identifier + global_name = namespace.create_name(name_hint, obj) + + if global_name in globals_: + assert globals_[global_name] is obj return global_name + globals_[global_name] = obj + return global_name - # set _custom_builtins here so that we needn't import colossalai in forward - _custom_builtins["colossalai"] = _CustomBuiltin( - "import colossalai", colossalai - ) - - # Pre-fill the globals table with registered builtins. - for name, (_, obj) in _custom_builtins.items(): - add_global(name, obj) + # set _custom_builtins here so that we needn't import colossalai in forward + _custom_builtins["colossalai"] = _CustomBuiltin("import colossalai", colossalai) - def type_repr(o: Any): - if o == (): - # Empty tuple is used for empty tuple type annotation Tuple[()] - return "()" + # Pre-fill the globals table with registered builtins. + for name, (_, obj) in _custom_builtins.items(): + add_global(name, obj) - typename = _type_repr(o) + def type_repr(o: Any): + if o == (): + # Empty tuple is used for empty tuple type annotation Tuple[()] + return "()" - if hasattr(o, "__origin__"): - # This is a generic type, e.g. 
typing.List[torch.Tensor] - origin_type = _origin_type_map.get(o.__origin__, o.__origin__) - origin_typename = add_global(_type_repr(origin_type), origin_type) + typename = _type_repr(o) - if hasattr(o, "__args__"): - # Assign global names for each of the inner type variables. - args = [type_repr(arg) for arg in o.__args__] + if hasattr(o, "__origin__"): + # This is a generic type, e.g. typing.List[torch.Tensor] + origin_type = _origin_type_map.get(o.__origin__, o.__origin__) + origin_typename = add_global(_type_repr(origin_type), origin_type) - if len(args) == 0: - # Bare type, such as `typing.Tuple` with no subscript - # This code-path used in Python < 3.9 - return origin_typename + if hasattr(o, "__args__"): + # Assign global names for each of the inner type variables. + args = [type_repr(arg) for arg in o.__args__] - return f'{origin_typename}[{",".join(args)}]' - else: + if len(args) == 0: # Bare type, such as `typing.Tuple` with no subscript - # This code-path used in Python 3.9+ + # This code-path used in Python < 3.9 return origin_typename - # Common case: this is a regular module name like 'foo.bar.baz' - return add_global(typename, o) - - def _format_args( - args: Tuple[Argument, ...], kwargs: Dict[str, Argument] - ) -> str: - def _get_repr(arg): - # Handle NamedTuples (if it has `_fields`) via add_global. - if isinstance(arg, tuple) and hasattr(arg, "_fields"): - qualified_name = _get_qualified_name(type(arg)) - global_name = add_global(qualified_name, type(arg)) - return f"{global_name}{repr(tuple(arg))}" - return repr(arg) - - args_s = ", ".join(_get_repr(a) for a in args) - kwargs_s = ", ".join(f"{k} = {_get_repr(v)}" for k, v in kwargs.items()) - if args_s and kwargs_s: - return f"{args_s}, {kwargs_s}" - return args_s or kwargs_s - - # Run through reverse nodes and record the first instance of a use - # of a given node. 
This represents the *last* use of the node in the - # execution order of the program, which we will use to free unused - # values - node_to_last_use: Dict[Node, Node] = {} - user_to_last_uses: Dict[Node, List[Node]] = {} - - def register_last_uses(n: Node, user: Node): - if n not in node_to_last_use: - node_to_last_use[n] = user - user_to_last_uses.setdefault(user, []).append(n) - - for node in reversed(nodes): - map_arg(node.args, lambda n: register_last_uses(n, node)) - map_arg(node.kwargs, lambda n: register_last_uses(n, node)) - - delete_free_var_from_last_use(user_to_last_uses) - - # NOTE: we add a variable to distinguish body and ckpt_func - def delete_unused_values(user: Node, body, to_keep=[]): - """ - Delete values after their last use. This ensures that values that are - not used in the remainder of the code are freed and the memory usage - of the code is optimal. - """ - if user.op == "placeholder": - return - if user.op == "output": - body.append("\n") - return - nodes_to_delete = user_to_last_uses.get(user, []) - nodes_to_delete = [i for i in nodes_to_delete if i.name not in to_keep] - if len(nodes_to_delete): - to_delete_str = " = ".join( - [repr(n) for n in nodes_to_delete] + ["None"] - ) - body.append(f"; {to_delete_str}\n") + return f'{origin_typename}[{",".join(args)}]' else: - body.append("\n") + # Bare type, such as `typing.Tuple` with no subscript + # This code-path used in Python 3.9+ + return origin_typename + + # Common case: this is a regular module name like 'foo.bar.baz' + return add_global(typename, o) + + def _format_args( + args: Tuple[Argument, ...], kwargs: Dict[str, Argument] + ) -> str: + def _get_repr(arg): + # Handle NamedTuples (if it has `_fields`) via add_global. 
+ if isinstance(arg, tuple) and hasattr(arg, "_fields"): + qualified_name = _get_qualified_name(type(arg)) + global_name = add_global(qualified_name, type(arg)) + return f"{global_name}{repr(tuple(arg))}" + return repr(arg) + + args_s = ", ".join(_get_repr(a) for a in args) + kwargs_s = ", ".join(f"{k} = {_get_repr(v)}" for k, v in kwargs.items()) + if args_s and kwargs_s: + return f"{args_s}, {kwargs_s}" + return args_s or kwargs_s + + # Run through reverse nodes and record the first instance of a use + # of a given node. This represents the *last* use of the node in the + # execution order of the program, which we will use to free unused + # values + node_to_last_use: Dict[Node, Node] = {} + user_to_last_uses: Dict[Node, List[Node]] = {} + + def register_last_uses(n: Node, user: Node): + if n not in node_to_last_use: + node_to_last_use[n] = user + user_to_last_uses.setdefault(user, []).append(n) + + for node in reversed(nodes): + map_arg(node.args, lambda n: register_last_uses(n, node)) + map_arg(node.kwargs, lambda n: register_last_uses(n, node)) + + delete_free_var_from_last_use(user_to_last_uses) + + # NOTE: we add a variable to distinguish body and ckpt_func + def delete_unused_values(user: Node, body, to_keep=[]): + """ + Delete values after their last use. This ensures that values that are + not used in the remainder of the code are freed and the memory usage + of the code is optimal. 
+ """ + if user.op == "placeholder": + return + if user.op == "output": + body.append("\n") + return + nodes_to_delete = user_to_last_uses.get(user, []) + nodes_to_delete = [i for i in nodes_to_delete if i.name not in to_keep] + if len(nodes_to_delete): + to_delete_str = " = ".join( + [repr(n) for n in nodes_to_delete] + ["None"] + ) + body.append(f"; {to_delete_str}\n") + else: + body.append("\n") - # NOTE: we add a variable to distinguish body and ckpt_func - def emit_node(node: Node, body): - maybe_type_annotation = ( - "" if node.type is None else f" : {type_repr(node.type)}" + # NOTE: we add a variable to distinguish body and ckpt_func + def emit_node(node: Node, body): + maybe_type_annotation = ( + "" if node.type is None else f" : {type_repr(node.type)}" + ) + if node.op == "placeholder": + assert isinstance(node.target, str) + maybe_default_arg = "" if not node.args else f" = {repr(node.args[0])}" + free_vars.append( + f"{node.target}{maybe_type_annotation}{maybe_default_arg}" ) - if node.op == "placeholder": - assert isinstance(node.target, str) - maybe_default_arg = ( - "" if not node.args else f" = {repr(node.args[0])}" - ) - free_vars.append( - f"{node.target}{maybe_type_annotation}{maybe_default_arg}" - ) - raw_name = node.target.replace("*", "") - if raw_name != repr(node): - body.append(f"{repr(node)} = {raw_name}\n") - return - elif node.op == "call_method": - assert isinstance(node.target, str) - body.append( - f"{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.target)}" - f"({_format_args(node.args[1:], node.kwargs)})" - ) - return - elif node.op == "call_function": - assert callable(node.target) - # pretty print operators - if ( - node.target.__module__ == "_operator" - and node.target.__name__ in magic_methods - ): - assert isinstance(node.args, tuple) - body.append( - f"{repr(node)}{maybe_type_annotation} = " - f"{magic_methods[node.target.__name__].format(*(repr(a) for a in node.args))}" - ) - return - - # pretty 
print inplace operators; required for jit.script to work properly - # not currently supported in normal FX graphs, but generated by torchdynamo - if ( - node.target.__module__ == "_operator" - and node.target.__name__ in inplace_methods - ): - body.append( - f"{inplace_methods[node.target.__name__].format(*(repr(a) for a in node.args))}; " - f"{repr(node)}{maybe_type_annotation} = {repr(node.args[0])}" - ) - return - - qualified_name = _get_qualified_name(node.target) - global_name = add_global(qualified_name, node.target) - # special case for getattr: node.args could be 2-argument or 3-argument - # 2-argument: attribute access; 3-argument: fall through to attrib function call with default value - if ( - global_name == "getattr" - and isinstance(node.args, tuple) - and isinstance(node.args[1], str) - and node.args[1].isidentifier() - and len(node.args) == 2 - ): - body.append( - f"{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.args[1])}" - ) - return + raw_name = node.target.replace("*", "") + if raw_name != repr(node): + body.append(f"{repr(node)} = {raw_name}\n") + return + elif node.op == "call_method": + assert isinstance(node.target, str) + body.append( + f"{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.target)}" + f"({_format_args(node.args[1:], node.kwargs)})" + ) + return + elif node.op == "call_function": + assert callable(node.target) + # pretty print operators + if ( + node.target.__module__ == "_operator" + and node.target.__name__ in magic_methods + ): + assert isinstance(node.args, tuple) body.append( - f"{repr(node)}{maybe_type_annotation} = {global_name}({_format_args(node.args, node.kwargs)})" + f"{repr(node)}{maybe_type_annotation} = " + f"{magic_methods[node.target.__name__].format(*(repr(a) for a in node.args))}" ) - if node.meta.get("is_wrapped", False): - wrapped_fns.setdefault(global_name) return - elif node.op == "call_module": - assert isinstance(node.target, str) + + # pretty 
print inplace operators; required for jit.script to work properly + # not currently supported in normal FX graphs, but generated by torchdynamo + if ( + node.target.__module__ == "_operator" + and node.target.__name__ in inplace_methods + ): body.append( - f"{repr(node)}{maybe_type_annotation} = " - f"{_format_target(root_module, node.target)}({_format_args(node.args, node.kwargs)})" + f"{inplace_methods[node.target.__name__].format(*(repr(a) for a in node.args))}; " + f"{repr(node)}{maybe_type_annotation} = {repr(node.args[0])}" ) return - elif node.op == "get_attr": - assert isinstance(node.target, str) + + qualified_name = _get_qualified_name(node.target) + global_name = add_global(qualified_name, node.target) + # special case for getattr: node.args could be 2-argument or 3-argument + # 2-argument: attribute access; 3-argument: fall through to attrib function call with default value + if ( + global_name == "getattr" + and isinstance(node.args, tuple) + and isinstance(node.args[1], str) + and node.args[1].isidentifier() + and len(node.args) == 2 + ): body.append( - f"{repr(node)}{maybe_type_annotation} = {_format_target(root_module, node.target)}" + f"{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.args[1])}" ) return - elif node.op == "output": - if node.type is not None: - maybe_return_annotation[0] = f" -> {type_repr(node.type)}" - body.append(self.generate_output(node.args[0])) - return - raise NotImplementedError(f"node: {node.op} {node.target}") - - # Modified for activation checkpointing - ckpt_func = [] - - # if any node has a list of labels for activation_checkpoint, we - # will use nested type of activation checkpoint codegen - emit_code_with_chunk( - body, - nodes, - emit_node, - delete_unused_values, - self.search_chunk, - self.chunk_infos, - ) - - if len(body) == 0: - # If the Graph has no non-placeholder nodes, no lines for the body - # have been emitted. 
To continue to have valid Python code, emit a - # single pass statement - body.append("pass\n") - - if len(wrapped_fns) > 0: - wrap_name = add_global("wrap", torch.fx.wrap) - wrap_stmts = "\n".join( - [f'{wrap_name}("{name}")' for name in wrapped_fns] + body.append( + f"{repr(node)}{maybe_type_annotation} = {global_name}({_format_args(node.args, node.kwargs)})" ) - else: - wrap_stmts = "" + if node.meta.get("is_wrapped", False): + wrapped_fns.setdefault(global_name) + return + elif node.op == "call_module": + assert isinstance(node.target, str) + body.append( + f"{repr(node)}{maybe_type_annotation} = " + f"{_format_target(root_module, node.target)}({_format_args(node.args, node.kwargs)})" + ) + return + elif node.op == "get_attr": + assert isinstance(node.target, str) + body.append( + f"{repr(node)}{maybe_type_annotation} = {_format_target(root_module, node.target)}" + ) + return + elif node.op == "output": + if node.type is not None: + maybe_return_annotation[0] = f" -> {type_repr(node.type)}" + body.append(self.generate_output(node.args[0])) + return + raise NotImplementedError(f"node: {node.op} {node.target}") + + # Modified for activation checkpointing + ckpt_func = [] + + # if any node has a list of labels for activation_checkpoint, we + # will use nested type of activation checkpoint codegen + emit_code_with_chunk( + body, + nodes, + emit_node, + delete_unused_values, + self.search_chunk, + self.chunk_infos, + ) + + if len(body) == 0: + # If the Graph has no non-placeholder nodes, no lines for the body + # have been emitted. 
To continue to have valid Python code, emit a + # single pass statement + body.append("pass\n") + + if len(wrapped_fns) > 0: + wrap_name = add_global("wrap", torch.fx.wrap) + wrap_stmts = "\n".join([f'{wrap_name}("{name}")' for name in wrapped_fns]) + else: + wrap_stmts = "" - if self._body_transformer: - body = self._body_transformer(body) + if self._body_transformer: + body = self._body_transformer(body) - for name, value in self.additional_globals(): - add_global(name, value) + for name, value in self.additional_globals(): + add_global(name, value) - # as we need colossalai.utils.checkpoint, we need to import colossalai - # in forward function - prologue = self.gen_fn_def(free_vars, maybe_return_annotation[0]) - prologue = "".join(ckpt_func) + prologue - prologue = prologue + # as we need colossalai.utils.checkpoint, we need to import colossalai + # in forward function + prologue = self.gen_fn_def(free_vars, maybe_return_annotation[0]) + prologue = "".join(ckpt_func) + prologue + prologue = prologue - code = "".join(body) - code = "\n".join(" " + line for line in code.split("\n")) - fn_code = f""" + code = "".join(body) + code = "\n".join(" " + line for line in code.split("\n")) + fn_code = f""" {wrap_stmts} {prologue} {code}""" - # print(fn_code) - return PythonCode(fn_code, globals_) + # print(fn_code) + return PythonCode(fn_code, globals_) From d3f5ce9efb35bf9e292aa041a3e98b737cbb68ee Mon Sep 17 00:00:00 2001 From: Frank Lee Date: Mon, 9 Jan 2023 16:21:44 +0800 Subject: [PATCH 112/209] [workflow] added nightly release to pypi (#2403) --- .github/workflows/release_nightly.yml | 86 +++++++-------------------- setup.py | 30 ++++++++-- 2 files changed, 45 insertions(+), 71 deletions(-) diff --git a/.github/workflows/release_nightly.yml b/.github/workflows/release_nightly.yml index 6bc000d1f4f6..8aa48b8ed89e 100644 --- a/.github/workflows/release_nightly.yml +++ b/.github/workflows/release_nightly.yml @@ -1,73 +1,29 @@ -name: Release bdist wheel for Nightly 
versions +name: Publish Nightly Version to PyPI on: - schedule: - # run at 00:00 of every Sunday - - cron: '0 0 * * 6' workflow_dispatch: + schedule: + - cron: '0 0 * * 6' # release on every Sunday 00:00 UTC time jobs: - matrix_preparation: - name: Prepare Container List + build-n-publish: + if: github.event_name == 'workflow_dispatch' || github.repository == 'hpcaitech/ColossalAI' + name: Build and publish Python 🐍 distributions 📦 to PyPI runs-on: ubuntu-latest - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} + timeout-minutes: 20 steps: - - id: set-matrix - run: | - matrix="[\"hpcaitech/cuda-conda:11.3\", \"hpcaitech/cuda-conda:10.2\"]" - echo $matrix - echo "::set-output name=matrix::{\"container\":$(echo $matrix)}" + - uses: actions/checkout@v2 - build: - name: Release bdist wheels - needs: matrix_preparation - if: github.repository == 'hpcaitech/ColossalAI' && contains(fromJson('["FrankLeeeee", "ver217", "feifeibear", "kurisusnowdeng"]'), github.actor) - runs-on: [self-hosted, gpu] - strategy: - fail-fast: false - matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}} - container: - image: ${{ matrix.container }} - options: --gpus all --rm - steps: - - uses: actions/checkout@v2 - with: - fetch-depth: 0 - # cub is for cuda 10.2 - - name: Copy scripts and checkout - run: | - cp -r ./.github/workflows/scripts/* ./ - ln -s /github/home/pip_wheels ./pip_wheels - wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip - unzip 1.8.0.zip - - name: Build bdist wheel - run: | - pip install beautifulsoup4 requests packaging - python ./build_colossalai_wheel.py --nightly - - name: 🚀 Deploy - uses: garygrossgarten/github-action-scp@release - with: - local: all_dist - remote: ${{ secrets.PRIVATE_PYPI_NIGHTLY_DIR }} - host: ${{ secrets.PRIVATE_PYPI_HOST }} - username: ${{ secrets.PRIVATE_PYPI_USER }} - password: ${{ secrets.PRIVATE_PYPI_PASSWD }} - remove_old_build: - name: Remove old nightly build - runs-on: ubuntu-latest - needs: build - steps: - 
- name: executing remote ssh commands using password - uses: appleboy/ssh-action@master - env: - BUILD_DIR: ${{ secrets.PRIVATE_PYPI_NIGHTLY_DIR }} - with: - host: ${{ secrets.PRIVATE_PYPI_HOST }} - username: ${{ secrets.PRIVATE_PYPI_USER }} - password: ${{ secrets.PRIVATE_PYPI_PASSWD }} - envs: BUILD_DIR - script: | - cd $BUILD_DIR - find . -type f -mtime +0 -exec rm -f {} + - script_stop: true + - uses: actions/setup-python@v2 + with: + python-version: '3.8.14' + + - run: NIGHTLY=1 python setup.py sdist build + + # publish to PyPI if executed on the main branch + - name: Publish package to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} + verbose: true diff --git a/setup.py b/setup.py index 38d5fa91cecd..5128b80e880d 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,6 @@ import os import re +from datetime import datetime from setuptools import find_packages, setup @@ -20,18 +21,22 @@ TORCH_AVAILABLE = False CUDA_HOME = None - # ninja build does not work unless include_dirs are abs path this_dir = os.path.dirname(os.path.abspath(__file__)) build_cuda_ext = False ext_modules = [] +is_nightly = int(os.environ.get('NIGHTLY', '0')) == 1 if int(os.environ.get('CUDA_EXT', '0')) == 1: if not TORCH_AVAILABLE: - raise ModuleNotFoundError("PyTorch is not found while CUDA_EXT=1. You need to install PyTorch first in order to build CUDA extensions") + raise ModuleNotFoundError( + "PyTorch is not found while CUDA_EXT=1. You need to install PyTorch first in order to build CUDA extensions" + ) if not CUDA_HOME: - raise RuntimeError("CUDA_HOME is not found while CUDA_EXT=1. You need to export CUDA_HOME environment vairable or install CUDA Toolkit first in order to build CUDA extensions") + raise RuntimeError( + "CUDA_HOME is not found while CUDA_EXT=1. 
You need to export CUDA_HOME environment vairable or install CUDA Toolkit first in order to build CUDA extensions" + ) build_cuda_ext = True @@ -139,8 +144,16 @@ def get_version(): print(f'===== Building Extension {name} =====') ext_modules.append(builder_cls().builder()) -setup(name='colossalai', - version=get_version(), +if is_nightly: + # use date as the nightly version + version = datetime.today().strftime('%Y.%m.%d') + package_name = 'colossalai-nightly' +else: + version = get_version() + package_name = 'colossalai' + +setup(name=package_name, + version=version, packages=find_packages(exclude=( 'benchmark', 'docker', @@ -179,4 +192,9 @@ def get_version(): 'Topic :: Scientific/Engineering :: Artificial Intelligence', 'Topic :: System :: Distributed Computing', ], - package_data={'colossalai': ['_C/*.pyi', 'kernel/cuda_native/csrc/*', 'kernel/cuda_native/csrc/kernel/*', 'kernel/cuda_native/csrc/kernels/include/*']}) + package_data={ + 'colossalai': [ + '_C/*.pyi', 'kernel/cuda_native/csrc/*', 'kernel/cuda_native/csrc/kernel/*', + 'kernel/cuda_native/csrc/kernels/include/*' + ] + }) From 212b5b1b5f4f3debf983d8c47c58af507a554be4 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Mon, 9 Jan 2023 16:29:33 +0800 Subject: [PATCH 113/209] add comments --- colossalai/autochunk/autochunk_codegen.py | 35 +++++++++++-------- .../test_autochunk/test_autochunk_codegen.py | 2 +- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py index 9ec59477b426..5ef560ac209a 100644 --- a/colossalai/autochunk/autochunk_codegen.py +++ b/colossalai/autochunk/autochunk_codegen.py @@ -1,4 +1,4 @@ -from typing import Any, Callable, Dict, Iterable, List, Tuple +from typing import Any, Dict, Iterable, List, Tuple import torch from torch.fx.graph import ( @@ -128,37 +128,42 @@ def _replace_input_var(chunk_inputs, region_idx, chunk_inputs_dim, node_idx, bod def emit_code_with_chunk( - body, - nodes, + body: 
List[str], + nodes: Iterable[Node], emit_node_func, delete_unused_value_func, search_chunk: SearchChunk, - chunk_infos, + chunk_infos: List, ): - """Emit code with nested activation checkpoint - When we detect some of the node.activation_checkpoint is a List, we will use - this function to emit the activation checkpoint codes. + """ + Emit code with chunk according to chunk_infos. + + It will generate a for loop in chunk regions, and replace inputs + and outputs of regions with chunked variables. Args: body: forward code - ckpt_func: checkpoint functions code nodes: graph.nodes emit_node_func: function to emit node delete_unused_value_func: function to remove the unused value + search_chunk: the class to search all chunks + chunk_infos: store all information about all chunks. """ node_list = list(nodes) - chunk_regions = [i["region"] for i in chunk_infos] - chunk_starts = [i[0] for i in chunk_regions] - chunk_ends = [i[1] for i in chunk_regions] + # chunk region + chunk_starts = [i["region"][0] for i in chunk_infos] + chunk_ends = [i["region"][1] for i in chunk_infos] - chunk_inputs = [i["inputs"] for i in chunk_infos] - chunk_inputs_non_chunk = [i["inputs_non_chunk"] for i in chunk_infos] - chunk_inputs_dim = [i["inputs_dim"] for i in chunk_infos] + # chunk inputs + chunk_inputs = [i["inputs"] for i in chunk_infos] # input with chunk + chunk_inputs_non_chunk = [i["inputs_non_chunk"] for i in chunk_infos] # input without chunk + chunk_inputs_dim = [i["inputs_dim"] for i in chunk_infos] # input chunk dim chunk_inputs_names = [j.name for i in chunk_inputs for j in i] + [ j.name for i in chunk_inputs_non_chunk for j in i ] + # chunk outputs chunk_outputs = [i["outputs"][0] for i in chunk_infos] chunk_outputs_dim = [i["outputs_dim"] for i in chunk_infos] @@ -170,6 +175,7 @@ def emit_code_with_chunk( while node_idx < len(node_list): node = node_list[node_idx] + # if is chunk start, generate for loop start if node_idx in chunk_starts: within_chunk_region = True 
region_idx = chunk_starts.index(node_idx) @@ -203,6 +209,7 @@ def emit_code_with_chunk( if node_idx not in chunk_inputs: delete_unused_value_func(node, body, chunk_inputs_names) + # generate chunk region end if node_idx in chunk_ends: body.append( _gen_loop_end( diff --git a/tests/test_autochunk/test_autochunk_codegen.py b/tests/test_autochunk/test_autochunk_codegen.py index c4f5cda67204..53f62077c07a 100644 --- a/tests/test_autochunk/test_autochunk_codegen.py +++ b/tests/test_autochunk/test_autochunk_codegen.py @@ -115,4 +115,4 @@ def test_autochunk_codegen(msa_len, pair_len, max_memory): if __name__ == "__main__": - _test_autochunk_codegen(0, 32, 64, None) + _test_autochunk_codegen(0, 32, 64, 25) From 1951f7fa87725b6cc719226d26e5734958adffac Mon Sep 17 00:00:00 2001 From: oahzxl Date: Mon, 9 Jan 2023 16:30:16 +0800 Subject: [PATCH 114/209] code style --- colossalai/autochunk/autochunk_codegen.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py index 5ef560ac209a..cc39e391e4be 100644 --- a/colossalai/autochunk/autochunk_codegen.py +++ b/colossalai/autochunk/autochunk_codegen.py @@ -137,9 +137,9 @@ def emit_code_with_chunk( ): """ Emit code with chunk according to chunk_infos. - - It will generate a for loop in chunk regions, and replace inputs - and outputs of regions with chunked variables. + + It will generate a for loop in chunk regions, and + replace inputs and outputs of regions with chunked variables. 
Args: body: forward code @@ -156,9 +156,11 @@ def emit_code_with_chunk( chunk_ends = [i["region"][1] for i in chunk_infos] # chunk inputs - chunk_inputs = [i["inputs"] for i in chunk_infos] # input with chunk - chunk_inputs_non_chunk = [i["inputs_non_chunk"] for i in chunk_infos] # input without chunk - chunk_inputs_dim = [i["inputs_dim"] for i in chunk_infos] # input chunk dim + chunk_inputs = [i["inputs"] for i in chunk_infos] # input with chunk + chunk_inputs_non_chunk = [ + i["inputs_non_chunk"] for i in chunk_infos + ] # input without chunk + chunk_inputs_dim = [i["inputs_dim"] for i in chunk_infos] # input chunk dim chunk_inputs_names = [j.name for i in chunk_inputs for j in i] + [ j.name for i in chunk_inputs_non_chunk for j in i ] From a68d240ed56dcd62a0726621c50233f733e79367 Mon Sep 17 00:00:00 2001 From: oahzxl Date: Mon, 9 Jan 2023 16:54:08 +0800 Subject: [PATCH 115/209] add doc for search chunk --- colossalai/autochunk/search_chunk.py | 76 ++++++++++++++++++++-------- 1 file changed, 55 insertions(+), 21 deletions(-) diff --git a/colossalai/autochunk/search_chunk.py b/colossalai/autochunk/search_chunk.py index 21b967497f1b..613c28454df3 100644 --- a/colossalai/autochunk/search_chunk.py +++ b/colossalai/autochunk/search_chunk.py @@ -1,4 +1,7 @@ import copy +from typing import Any, Dict, Iterable, List, Tuple + +from torch.fx.node import Node from .estimate_memory import EstimateMemory from .reorder_graph import ReorderGraph @@ -13,6 +16,34 @@ class SearchChunk(object): + """ + This is the core class for AutoChunk. + + It defines the framework of the strategy of AutoChunk. + Chunks will be selected one by one utill search stops. + + The chunk search is as follows: + 1. find the peak memory node + 2. find the max chunk region according to the peak memory node + 3. find all possible chunk regions in the max chunk region + 4. find the best chunk region for current status + 5. 
goto 1 + + Attributes: + gm: graph model + print_mem (bool): print estimated memory + trace_index: trace the flow of every dim of every node to find all free dims + trace_flow: determine the region chunk strategy + reorder_graph: reorder nodes to improve chunk efficiency + estimate_memory: estimate memory with chunk + select_chunk: select the best chunk region + + Args: + gm: graph model + max_memory (int): max memory in MB + print_mem (bool): print estimated memory + """ + def __init__(self, gm, max_memory=None, print_mem=False) -> None: self.gm = gm self.print_mem = print_mem @@ -33,24 +64,37 @@ def _find_peak_node(self, mem_peak): max_idx = mem_peak.index(max_value) return max_idx - def _get_free_var(self): + def _get_free_var_idx(self) -> List: + """ + Get free var index + + Returns: + free_var_idx (List): all indexs of free vars + """ free_var_idx = [] for idx, n in enumerate(self.trace_index.node_list): if n.op == "placeholder": free_var_idx.append(idx) return free_var_idx - def _get_min_free_var(self, active_node_list, free_vars): - min_len = 999 - for idx, n in enumerate(active_node_list): - if idx in free_vars: - continue - if len(n) < min_len: - min_len = len(n) - return min_len + def _search_max_chunk_region( + self, active_node: List, peak_node: Node, chunk_regions: List + ) -> Tuple: + """ + Search max chunk region according to peak memory node + + Chunk region starts extending from the peak node, stops where free var num is min - def _search_max_chunk_region(self, active_node, peak_node, chunk_regions): - free_vars = self._get_free_var() + Args: + active_node (List): active node status for every node + peak_node (Node): peak memory node + chunk_regions (List): chunk region info + + Returns: + chunk_region_start (int) + chunk_region_end (int) + """ + free_vars = self._get_free_var_idx() free_var_num = len(free_vars) active_node_num = [len(i) for i in active_node] min_active_node_num = min(active_node_num[free_var_num:]) @@ -92,16 +136,6 @@ def 
_search_max_chunk_region(self, active_node, peak_node, chunk_regions): chunk_region_end = region[0] - 1 return chunk_region_start, chunk_region_end - def _is_not_compute(self, trace, chunk_range, dim_idx): - if trace["idx"][dim_idx] not in trace["compute"]: - return True - if trace["idx"][dim_idx] in trace["compute"] and all( - i < chunk_range[0] or i > chunk_range[1] - for i in trace["compute"][trace["idx"][dim_idx]] - ): - return True - return False - def _find_free_dim(self, input_trace, output_trace, start_idx, end_idx): start_traces = input_trace[start_idx] end_trace = output_trace[end_idx] From 85e045b063a70cd36ccc0405acc245d86f2a1621 Mon Sep 17 00:00:00 2001 From: Frank Lee Date: Mon, 9 Jan 2023 17:08:55 +0800 Subject: [PATCH 116/209] [doc] updated readme regarding pypi installation (#2406) --- README-zh-Hans.md | 46 ++++++++++++++++++++++++++++++++++------------ README.md | 28 ++++++++++++++++++++++++---- 2 files changed, 58 insertions(+), 16 deletions(-) diff --git a/README-zh-Hans.md b/README-zh-Hans.md index 8edcff28bf04..b97b02f5ab84 100644 --- a/README-zh-Hans.md +++ b/README-zh-Hans.md @@ -5,10 +5,10 @@ Colossal-AI: 一个面向大模型时代的通用深度学习系统 -

论文 | - 文档 | - 例程 | - 论坛 | +

论文 | + 文档 | + 例程 | + 论坛 | 博客

[![Build](https://github.com/hpcaitech/ColossalAI/actions/workflows/build.yml/badge.svg)](https://github.com/hpcaitech/ColossalAI/actions/workflows/build.yml) @@ -35,7 +35,7 @@
  • 为何选择 Colossal-AI
  • 特点
  • - 并行训练样例展示 + 并行训练样例展示
  • - 单GPU训练样例展示 + 单GPU训练样例展示
  • - 推理 (Energon-AI) 样例展示 + 推理 (Energon-AI) 样例展示
  • - Colossal-AI 成功案例 + Colossal-AI 成功案例