2 changes: 1 addition & 1 deletion fastdeploy/model_executor/layers/embeddings.py
@@ -22,7 +22,7 @@
from paddle.distributed import fleet

from fastdeploy.config import FDConfig
from fastdeploy.model_executor.models.utils import set_weight_attrs
from fastdeploy.model_executor.utils import set_weight_attrs

from .utils import get_tensor

198 changes: 81 additions & 117 deletions fastdeploy/model_executor/layers/linear.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion fastdeploy/model_executor/layers/lm_head.py
@@ -22,7 +22,7 @@
from paddle.distributed import fleet

from fastdeploy.config import FDConfig
from fastdeploy.model_executor.models.utils import set_weight_attrs
from fastdeploy.model_executor.utils import set_weight_attrs

from .utils import get_tensor

11 changes: 3 additions & 8 deletions fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py
@@ -19,7 +19,7 @@
import paddle
from paddle import nn

from fastdeploy.model_executor.layers.utils import set_weight_attrs
from fastdeploy.model_executor.utils import set_weight_attrs
from fastdeploy.platforms import current_platform

from ..quantization.quant_base import QuantMethodBase
@@ -185,9 +185,11 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
if current_platform.is_cuda():
self.up_gate_proj_weight_shape = [layer.num_experts, layer.hidden_size, layer.moe_intermediate_size * 2]
self.down_proj_weight_shape = [layer.num_experts, layer.moe_intermediate_size, layer.hidden_size]
extra_weight_attrs = {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 1, "down": 0, "up": 1}}
else:
self.up_gate_proj_weight_shape = [layer.num_experts, layer.moe_intermediate_size * 2, layer.hidden_size]
self.down_proj_weight_shape = [layer.num_experts, layer.hidden_size, layer.moe_intermediate_size]
extra_weight_attrs = {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0}}

layer.up_gate_proj_weight = layer.create_parameter(
shape=self.up_gate_proj_weight_shape,
@@ -203,10 +205,3 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):

set_weight_attrs(layer.up_gate_proj_weight, extra_weight_attrs)
set_weight_attrs(layer.down_proj_weight, extra_weight_attrs)

if layer.moe_use_gate_correction_bias:
gate_correction_bias_shape = [1, layer.num_experts]
layer.gate_correction_bias = layer.create_parameter(
shape=gate_correction_bias_shape,
dtype="float32",
)
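
The two SHARD_ID_TO_SHARDED_DIM mappings added above appear to record, for each checkpoint shard id ("gate", "up", "down"), which axis of the per-expert weight is split across tensor-parallel ranks; the CUDA and non-CUDA branches differ only because their weight layouts are transposed. Below is a minimal sketch of how a weight loader could consume that attribute. The helper name shard_for_tp and the tp_rank/tp_size arguments are illustrative, and it assumes set_weight_attrs exposes the dict as an attribute on the parameter.

import paddle

def shard_for_tp(param, loaded_weight, shard_id, tp_rank, tp_size):
    # Look up which axis of this shard is tensor-parallel sharded,
    # e.g. {"gate": 1, "down": 0, "up": 1} for the CUDA layout above.
    shard_dim = param.SHARD_ID_TO_SHARDED_DIM[shard_id]
    size = loaded_weight.shape[shard_dim] // tp_size
    start = tp_rank * size
    # Slice only along the sharded axis; every other axis is copied whole.
    return paddle.slice(loaded_weight, axes=[shard_dim], starts=[start], ends=[start + size])

Under the CUDA layout, the gate and up shards split along the intermediate axis (dim 1 of the per-expert up_gate_proj weight) while down splits along dim 0; the non-CUDA layout swaps the axes, which is what the second mapping encodes.
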
186 changes: 147 additions & 39 deletions fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py
@@ -38,6 +38,8 @@
moe_expert_reduce,
)

from fastdeploy.model_executor.utils import TensorTracker, free_tensor, set_weight_attrs


# used for deepseek_v3
def get_moe_scores(
@@ -93,8 +95,8 @@ def compute_ffn(
return fastdeploy.model_executor.ops.iluvatar.moe_expert_ffn(
permute_input,
token_nums_per_expert,
layer.up_gate_proj_weight,
layer.down_proj_weight,
getattr(layer, self.added_weight_attrs[0]),
getattr(layer, self.added_weight_attrs[1]),
Comment on lines -96 to +99

Collaborator: I'd prefer to keep this as it was; it reads more clearly.

Collaborator Author: But the attribute names differ before and after quantization.

Collaborator Author: They may also end up different, because w4a8 inherits this method and how w4a8 will be handled hasn't been decided yet.

None,
(layer.up_gate_proj_weight_scale if hasattr(layer, "up_gate_proj_weight_scale") else None),
(layer.down_proj_weight_scale if hasattr(layer, "down_proj_weight_scale") else None),
@@ -106,8 +108,8 @@ def compute_ffn(
return fastdeploy.model_executor.ops.gpu.moe_expert_ffn(
permute_input,
token_nums_per_expert,
layer.up_gate_proj_weight,
layer.down_proj_weight,
getattr(layer, self.added_weight_attrs[0]),
getattr(layer, self.added_weight_attrs[1]),
None,
(layer.up_gate_proj_weight_scale if hasattr(layer, "up_gate_proj_weight_scale") else None),
(layer.down_proj_weight_scale if hasattr(layer, "down_proj_weight_scale") else None),
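
The thread above comes down to a naming issue: once a backend quantizes its weights, the tensors live under different attribute names, so compute_ffn can no longer hard-code layer.up_gate_proj_weight. Here is a minimal illustration of the indirection; the class names are invented, and the "*_quant_weight" names are only inferred from the weight_name.replace("quant_weight", "weight") call later in this diff.

class UnquantizedMoEMethod:
    # Unquantized backends keep the plain parameter names.
    added_weight_attrs = ["up_gate_proj_weight", "down_proj_weight"]

class WeightOnlyMoEMethod:
    # Weight-only backends store the packed tensors under quantized names.
    added_weight_attrs = ["up_gate_proj_quant_weight", "down_proj_quant_weight"]

def resolve_ffn_weights(method, layer):
    # The same call site resolves the right tensors for either backend.
    up_gate_proj = getattr(layer, method.added_weight_attrs[0])
    down_proj = getattr(layer, method.added_weight_attrs[1])
    return up_gate_proj, down_proj
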
@@ -392,12 +394,12 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
Paddle cutlass create weight process.
"""
self.weight_dtype = "int8"
self.ffn1_weight_shape = [
self.up_gate_proj_weight_shape = [
layer.num_local_experts,
layer.hidden_size // 2,
layer.moe_intermediate_size * 2,
]
self.ffn2_weight_shape = [
self.down_proj_weight_shape = [
layer.num_local_experts,
layer.moe_intermediate_size // 2,
layer.hidden_size,
@@ -406,7 +408,7 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
layer,
self.added_weight_attrs[0],
layer.create_parameter(
shape=self.ffn1_weight_shape,
shape=self.up_gate_proj_weight_shape,
dtype=self.weight_dtype,
default_initializer=paddle.nn.initializer.Constant(0),
),
@@ -415,7 +417,7 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
layer,
self.added_weight_attrs[1],
layer.create_parameter(
shape=self.ffn2_weight_shape,
shape=self.down_proj_weight_shape,
dtype=self.weight_dtype,
default_initializer=paddle.nn.initializer.Constant(0),
),
@@ -625,71 +627,177 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
Paddle cutlass create weight process.
"""
self.default_dtype = layer._helper.get_default_dtype()
self.weight_dtype = "int8"

up_gate_proj_weight_name = self.added_weight_attrs[0]
down_proj_weight_name = self.added_weight_attrs[1]
if self.moe_quant_type == "weight_only_int4":
self.ffn1_weight_shape = [
self.up_gate_proj_weight_shape = [
layer.num_local_experts,
layer.moe_intermediate_size,
layer.hidden_size,
]
else:
self.ffn1_weight_shape = [
self.up_gate_proj_weight_shape = [
layer.num_local_experts,
layer.moe_intermediate_size * 2,
layer.hidden_size,
]
if self.moe_quant_type == "weight_only_int4":
self.ffn2_weight_shape = [
self.down_proj_weight_shape = [
layer.num_local_experts,
layer.hidden_size // 2,
layer.moe_intermediate_size,
]
else:
self.ffn2_weight_shape = [
self.down_proj_weight_shape = [
layer.num_local_experts,
layer.hidden_size,
layer.moe_intermediate_size,
]
setattr(
layer,
up_gate_proj_weight_name,
layer.create_parameter(
shape=self.ffn1_weight_shape,
dtype=self.weight_dtype,
self.up_gate_proj_scale_shape = [layer.num_local_experts, layer.moe_intermediate_size * 2]
self.down_proj_scale_shape = [layer.num_local_experts, layer.hidden_size]

if layer.fd_config.load_config.load_choices == "default_v1":
layer.up_gate_proj_weight = layer.create_parameter(
shape=[layer.num_experts, layer.hidden_size, layer.moe_intermediate_size * 2],
dtype=layer.weight_dtype,
default_initializer=paddle.nn.initializer.Constant(0),
),
)
setattr(
layer,
down_proj_weight_name,
layer.create_parameter(
shape=self.ffn2_weight_shape,
dtype=self.weight_dtype,
)

layer.down_proj_weight = layer.create_parameter(
shape=[layer.num_experts, layer.moe_intermediate_size, layer.hidden_size],
dtype=layer.weight_dtype,
default_initializer=paddle.nn.initializer.Constant(0),
),
)
# weight_scale
)

set_weight_attrs(
layer.up_gate_proj_weight,
{
**extra_weight_attrs,
"tensor_track": TensorTracker(shape=layer.up_gate_proj_weight.shape, output_dim=True),
},
)
set_weight_attrs(
layer.down_proj_weight,
{
**extra_weight_attrs,
"tensor_track": TensorTracker(shape=layer.down_proj_weight.shape, output_dim=False),
},
)
Collaborator: Are markers like output_dim supported under both TP parallelism and EP parallelism?

Collaborator Author: The EP on-disk weights don't use the output_dim attribute; TP is supported.

else:
self.weight_dtype = "int8"

up_gate_proj_weight_name = self.added_weight_attrs[0]
down_proj_weight_name = self.added_weight_attrs[1]
up_gate_proj_scale_name = self.added_scale_attrs[0]
down_proj_scale_name = self.added_scale_attrs[1]

setattr(
layer,
up_gate_proj_weight_name,
layer.create_parameter(
shape=self.up_gate_proj_weight_shape,
dtype=self.weight_dtype,
default_initializer=paddle.nn.initializer.Constant(0),
),
)
setattr(
layer,
down_proj_weight_name,
layer.create_parameter(
shape=self.down_proj_weight_shape,
dtype=self.weight_dtype,
default_initializer=paddle.nn.initializer.Constant(0),
),
)
# weight_scale
setattr(
layer,
up_gate_proj_scale_name,
layer.create_parameter(
shape=self.up_gate_proj_scale_shape,
dtype=self.default_dtype,
default_initializer=paddle.nn.initializer.Constant(0),
),
)
setattr(
layer,
down_proj_scale_name,
layer.create_parameter(
shape=self.down_proj_scale_shape,
dtype=self.default_dtype,
default_initializer=paddle.nn.initializer.Constant(0),
),
)

moe_extra_weight_attrs = {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0}}
set_weight_attrs(layer.up_gate_proj_weight, moe_extra_weight_attrs)
set_weight_attrs(layer.down_proj_weight, moe_extra_weight_attrs)
scale_extra_weight_attrs = {
**extra_weight_attrs,
"SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "up": 0, "down": None},
}
set_weight_attrs(layer.up_gate_proj_weight_scale, scale_extra_weight_attrs)
set_weight_attrs(layer.down_proj_weight_scale, scale_extra_weight_attrs)

def process_weights_after_loading(self, layer):
""" """
if not layer.fd_config.load_config.load_choices == "default_v1":
return
weight_id_map = {"gate_up": 0, "down": 1}
if (
hasattr(layer.up_gate_proj_weight, "tensor_track")
and layer.up_gate_proj_weight.tensor_track is not None
and layer.up_gate_proj_weight.tensor_track.is_fully_copied()
):
weight_type = "gate_up"
else:
weight_type = "down"

# 1.init shape and type
# weight
weight_name = self.added_weight_attrs[weight_id_map[weight_type]]
unquantized_weight_name = weight_name.replace("quant_weight", "weight")
weight_shape = self.up_gate_proj_weight_shape if weight_type == "gate_up" else self.down_proj_weight_shape
weight_dtype = "int8"
# scale
scale_name = self.added_scale_attrs[weight_id_map[weight_type]]
scale_shape = self.up_gate_proj_scale_shape if weight_type == "gate_up" else self.down_proj_scale_shape
scale_dtype = self.default_dtype

# 2.crate tmp tensor

weight = paddle.empty(weight_shape, dtype=weight_dtype)
scale = paddle.empty(scale_shape, dtype=scale_dtype)

# 3.quantize weight

for expert_id in range(layer.num_experts):
weight[expert_id], scale[expert_id] = weight_quantize(
getattr(layer, unquantized_weight_name)[expert_id], algo=self.moe_quant_type
)

free_tensor(getattr(layer, unquantized_weight_name))

# create weight
setattr(
layer,
self.added_scale_attrs[0],
weight_name,
layer.create_parameter(
shape=[layer.num_local_experts, layer.moe_intermediate_size * 2],
dtype=self.default_dtype,
shape=weight_shape,
dtype=weight_dtype,
default_initializer=paddle.nn.initializer.Constant(0),
),
)
# create scale
setattr(
layer,
self.added_scale_attrs[1],
scale_name,
layer.create_parameter(
shape=[layer.num_local_experts, layer.hidden_size],
dtype=self.default_dtype,
shape=scale_shape,
dtype=scale_dtype,
default_initializer=paddle.nn.initializer.Constant(0),
),
)
getattr(layer, weight_name).copy_(weight, False)
getattr(layer, scale_name).copy_(scale, False)

def process_loaded_weights(self, layer: nn.Layer, state_dict):
"""
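
Taken together, the default_v1 path first loads full-precision expert weights into the bf16-shaped parameters created above, uses the TensorTracker attribute to detect when a fused weight has been fully copied, and only then quantizes it in process_weights_after_loading: each expert slice goes through weight_quantize, the bf16 tensor is freed, and a fresh int8 weight parameter plus its scale are bound in its place. A condensed, standalone sketch of just that quantization loop follows; it assumes the weight_quantize called in the diff is paddle.nn.quant.weight_quantize, and the shapes are illustrative.

import paddle
from paddle.nn.quant import weight_quantize

def quantize_experts(bf16_weight, algo="weight_only_int8"):
    # Quantize a [num_experts, in_dim, out_dim] tensor expert by expert,
    # mirroring the per-expert loop in process_weights_after_loading.
    quant_slices, scale_slices = [], []
    for expert_id in range(bf16_weight.shape[0]):
        q, s = weight_quantize(bf16_weight[expert_id], algo=algo)
        quant_slices.append(q)
        scale_slices.append(s)
    return paddle.stack(quant_slices), paddle.stack(scale_slices)

# Example usage with the shapes created in the default_v1 branch above
# (hypothetical sizes, run on GPU):
# w = paddle.randn([num_experts, hidden_size, moe_intermediate_size * 2], dtype="bfloat16")
# quant_w, scales = quantize_experts(w)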