Lf #54 (merged)

2 changes: 1 addition & 1 deletion colossalai/cli/launcher/__init__.py
@@ -28,7 +28,7 @@
type=str,
default=None,
help=
"Specify computing devices to NOT use during execution. Mutually exclusive with --include. Formatting is the same as --includ,"
"Specify computing devices to NOT use during execution. Mutually exclusive with --include. Formatting is the same as --include,"
" only effective when used with --hostfile.")
@click.option("--num_nodes",
type=int,
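For context on the option touched above: click has no built-in mutual exclusion, so a pair like --include/--exclude is typically validated by hand. A minimal sketch (option names mirror the diff; the rest is illustrative, not the repo's launcher):

    import click

    @click.command()
    @click.option("--include", type=str, default=None)
    @click.option("--exclude", type=str, default=None)
    def launch(include, exclude):
        # click has no native mutual-exclusion support, so validate manually.
        if include is not None and exclude is not None:
            raise click.UsageError("--include and --exclude are mutually exclusive.")
        click.echo(f"include={include}, exclude={exclude}")

    if __name__ == "__main__":
        launch()
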
2 changes: 1 addition & 1 deletion colossalai/cli/launcher/hostinfo.py
@@ -38,7 +38,7 @@ def is_host_localhost(hostname: str, port: str = None) -> None:

# socket.getfqdn("127.0.0.1") does not return localhost
# on some users' machines
-# thus, we directly return True if hostname is locahost, 127.0.0.1 or 0.0.0.0
+# thus, we directly return True if hostname is localhost, 127.0.0.1 or 0.0.0.0
if hostname in ("localhost", "127.0.0.1", "0.0.0.0"):
return True

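To illustrate the comment being corrected here, a standalone sketch of the localhost shortcut; the socket fallback is an assumption for completeness, not the repo's exact code:

    import socket

    def is_localhost(hostname: str) -> bool:
        # socket.getfqdn("127.0.0.1") does not return "localhost" on some
        # machines, so the common local addresses are matched directly first.
        if hostname in ("localhost", "127.0.0.1", "0.0.0.0"):
            return True
        # Fallback (illustrative): compare the resolved address of the given
        # hostname against this machine's own resolved address.
        try:
            return socket.gethostbyname(hostname) == socket.gethostbyname(socket.gethostname())
        except socket.gaierror:
            return False
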
2 changes: 1 addition & 1 deletion colossalai/cli/launcher/multinode_runner.py
@@ -114,7 +114,7 @@ def recv_from_all(self) -> dict:
Receive messages from all hosts

Returns:
-msg_from_node (dict): a dictionry which contains messages from each node
+msg_from_node (dict): a dictionary which contains messages from each node
"""

msg_from_node = dict()
2 changes: 1 addition & 1 deletion colossalai/cli/launcher/run.py
@@ -298,7 +298,7 @@ def launch_multi_processes(args: Config) -> None:
# receive the stop status
msg_from_node = runner.recv_from_all()

-# printe node status
+# print node status
click.echo("\n====== Stopping All Nodes =====")
for hostname, msg in msg_from_node.items():
click.echo(f"{hostname}: {msg}")
2 changes: 1 addition & 1 deletion colossalai/device/alpha_beta_profiler.py
@@ -197,7 +197,7 @@ def get_max_nbytes(process_group: Tuple[int], pg_handler: dist.ProcessGroup):
dist.broadcast_object_list(broadcast_list, src=process_group[0])
alpha_beta_dict[process_group] = tuple(broadcast_list)

-# add symmetry pair to the apha_beta_dict
+# add symmetry pair to the alpha_beta_dict
symmetry_ab_dict = {}
for process_group, alpha_beta_pair in alpha_beta_dict.items():
symmetry_process_group = (process_group[1], process_group[0])
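The hunk above touches the symmetry step of the alpha-beta profile. A sketch of the idea, under the assumption that communication cost between ranks (a, b) equals that of (b, a); the measurement value is hypothetical:

    # Mirror each measured (alpha, beta) pair so lookups work in either
    # direction; this assumes symmetric link cost between any two ranks.
    alpha_beta_dict = {(0, 1): (1e-5, 2e-9)}    # hypothetical measurement
    symmetry_ab_dict = {}
    for process_group, alpha_beta_pair in alpha_beta_dict.items():
        symmetry_process_group = (process_group[1], process_group[0])
        symmetry_ab_dict[process_group] = alpha_beta_pair
        symmetry_ab_dict[symmetry_process_group] = alpha_beta_pair
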
@@ -51,7 +51,7 @@ def extract_kwargs_from_mod(self):

For example:
The kwargs for conv2d module is {} because the attributes like 'padding' or 'groups' are
-considered during module initilizing. However, we need to consider those attributes as kwargs
+considered during module initializing. However, we need to consider those attributes as kwargs
in F.conv2d.
"""
pass
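The docstring above contrasts module attributes with functional kwargs. A short self-contained example of the conv2d case it mentions (shapes chosen arbitrarily):

    import torch
    import torch.nn.functional as F

    conv = torch.nn.Conv2d(3, 8, kernel_size=3, padding=1, groups=1)
    x = torch.randn(1, 3, 16, 16)
    # nn.Conv2d keeps 'padding' and 'groups' as attributes set at init time,
    # so the module call needs no kwargs, while F.conv2d must receive them.
    y_mod = conv(x)
    y_fn = F.conv2d(x, conv.weight, conv.bias, padding=1, groups=1)
    assert torch.allclose(y_mod, y_fn)
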
2 changes: 1 addition & 1 deletion colossalai/fx/tracer/experimental.py
@@ -295,7 +295,7 @@ class PatchedCheckpointFunction(torch.autograd.Function):

@staticmethod
def forward(ctx, run_function, preserve_rng_state, *args):
-# signal that the current tracing occurs within activaton checkpoint part
+# signal that the current tracing occurs within activation checkpoint part
self.inside_torch_checkpoint_func = True
out = run_function(*args)
self.inside_torch_checkpoint_func = False
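The "activation checkpoint part" in the fixed comment refers to PyTorch's recompute-in-backward mechanism. A minimal usage example of the stock API (not the patched tracer itself; the block function is illustrative):

    import torch
    from torch.utils.checkpoint import checkpoint

    def block(x):
        return torch.relu(x @ x.t())

    x = torch.randn(4, 4, requires_grad=True)
    # The forward pass runs without storing intermediates; they are recomputed
    # during backward. The patched tracer flags node creation in this region.
    out = checkpoint(block, x)
    out.sum().backward()
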
6 changes: 3 additions & 3 deletions colossalai/fx/tracer/tracer.py
@@ -92,7 +92,7 @@ def create_proxy(self, kind, target, args, kwargs, name=None, type_expr=None, pr
return proxy

# if graph is traced for auto parallelism module, some extra node will be added during
-# graph construction to deal with the compatability between bias addition and all reduce.
+# graph construction to deal with the compatibility between bias addition and all reduce.

# if no extra manipulation is applied, we just pass the origin arguments to create_proxy function
# to create node on computation graph
@@ -208,7 +208,7 @@ def _configure_tracer_type(self, tracer_type: TracerType):
self.proxy_cls = ColoProxy
self.tracer_type = TracerType.META
else:
raise ValueError(f"Unrecognised tracer type {tracer_type}")
raise ValueError(f"Unrecognized tracer type {tracer_type}")

def _meta_data_computing(self, kind, target, args, kwargs):

@@ -445,7 +445,7 @@ class PatchedCheckpointFunction(torch.autograd.Function):

@staticmethod
def forward(ctx, run_function, preserve_rng_state, *args):
-# signal that the current tracing occurs within activaton checkpoint part
+# signal that the current tracing occurs within activation checkpoint part
self.inside_torch_checkpoint_func = True
out = run_function(*args)
self.inside_torch_checkpoint_func = False
2 changes: 1 addition & 1 deletion colossalai/kernel/cuda_native/flash_attention.py
@@ -138,7 +138,7 @@ def forward(self,
elif attn_mask_type == AttnMaskType.causal: # gpt style
attn_bias = LowerTriangularMask()

-if bias is not None: # alibi / relative position emebedding
+if bias is not None: # alibi / relative position embedding
assert allow_alibi, "flash attention with bias is not supported in this system."
assert attn_mask_type == AttnMaskType.causal, \
"attention with bias is only supported for causal attention so far."
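For reference, a plain-PyTorch sketch of causal ("GPT style") attention with an optional additive bias such as ALiBi or a relative position embedding, mirroring the terms in the hunk above; this is a naive reference, not the flash-attention kernel:

    import torch

    def causal_attention(q, k, v, bias=None):
        scores = q @ k.transpose(-2, -1) / (k.shape[-1] ** 0.5)
        # Lower-triangular (causal) mask: each position attends only backwards.
        mask = torch.triu(torch.ones(scores.shape[-2], scores.shape[-1], dtype=torch.bool), diagonal=1)
        scores = scores.masked_fill(mask, float("-inf"))
        if bias is not None:    # e.g. ALiBi / relative position embedding
            scores = scores + bias
        return torch.softmax(scores, dim=-1) @ v
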
2 changes: 1 addition & 1 deletion colossalai/kernel/cuda_native/multihead_attention.py
@@ -43,7 +43,7 @@ class Config:
attn_prob_dropout_ratio: float # attention score dropout ratio
hidden_dropout_ratio: float # dropout ration before residual
norm_first: bool # norm_first
-fp16: bool # fp16 presion
+fp16: bool # fp16 precision


class MultiHeadAttention1DFunc(Function):
2 changes: 1 addition & 1 deletion colossalai/kernel/jit/option.py
@@ -43,7 +43,7 @@ def warmup_jit_fusion(batch_size: int,
seq_length: int = 512,
vocab_size: int = 32768,
dtype: torch.dtype = torch.float32):
""" Compilie JIT functions before the main training steps """
""" Compile JIT functions before the main training steps """

embed = Embedding(vocab_size, hidden_size).to(get_current_device())
linear_1 = Linear(hidden_size, hidden_size * 4, skip_bias_add=True).to(get_current_device())
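The docstring being fixed describes warming up TorchScript fusions before timed training. A small sketch of the pattern, assuming a scripted elementwise function is the fusion target (bias_gelu here is illustrative, not the repo's kernel):

    import torch

    @torch.jit.script
    def bias_gelu(bias: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        x = bias + y
        return x * 0.5 * (1.0 + torch.erf(x * 0.7071067811865476))

    # Run a few dummy batches so TorchScript compiles (and can fuse) the
    # kernel before the timed training steps begin.
    for _ in range(3):
        bias_gelu(torch.randn(64), torch.randn(16, 64))
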