Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .github/workflows/trunk.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ on:
paths:
- .ci/docker/ci_commit_pins/pytorch.txt
- .ci/scripts/**
- backends/arm/**
- examples/arm/**
- backends/cortex_m/**
workflow_dispatch:

concurrency:
Expand Down Expand Up @@ -1051,6 +1054,7 @@ jobs:
runs-on: ubuntu-latest
outputs:
qnn: ${{ steps.filter.outputs.qnn }}
arm: ${{ steps.filter.outputs.arm }}
steps:
- uses: actions/checkout@v4
- uses: dorny/paths-filter@v3
Expand All @@ -1061,6 +1065,10 @@ jobs:
- 'backends/qualcomm/**'
- 'examples/qualcomm/**'
- 'examples/models/llama/**'
arm:
- 'backends/arm/**'
- 'examples/arm/**'
- 'backends/cortex_m/**'

test-static-llama-qnn-eval-linux:
needs: changes # has dependency on changes jobs defined above
Expand Down Expand Up @@ -1135,6 +1143,8 @@ jobs:
}"

test-mcu-cortex-m-backend:
needs: changes
if: needs.changes.outputs.arm == 'true'
name: test-mcu-cortex-m-backend
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
permissions:
Expand Down
6 changes: 3 additions & 3 deletions backends/cortex_m/ops/operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,7 +352,7 @@ def quantized_linear_meta(
activation_min,
) -> torch.Tensor:

shape = (*input.shape[:-1], weights.shape[1])
shape = (*input.shape[:-1], weights.shape[0])
return torch.empty(shape, dtype=input.dtype, device=input.device)


Expand Down Expand Up @@ -386,7 +386,7 @@ def quantized_linear_impl(
input_reshaped = input_int32.reshape(new_shape)

lhs_sum = torch.sum(input_reshaped, dim=-1, keepdim=True) * filter_offset
output = torch.mm(input_reshaped, weights_int32) + lhs_sum + kernel_sum
output = torch.mm(input_reshaped, weights_int32.T) + lhs_sum + kernel_sum
output_shape = (*input.shape[:-1], output.shape[-1])
output_reshaped = output.reshape(output_shape)
else:
Expand All @@ -396,7 +396,7 @@ def quantized_linear_impl(
new_shape = (prod(input.shape[:-1]), input.shape[-1])
input_reshaped = input_int32.reshape(new_shape)

output = torch.mm(input_reshaped, weights_int32)
output = torch.mm(input_reshaped, weights_int32.T)
if bias is not None:
output = output + bias
output_shape = (*input.shape[:-1], output.shape[-1])
Expand Down
25 changes: 4 additions & 21 deletions backends/cortex_m/passes/convert_to_cortex_m_pass.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,19 +33,14 @@ class ConvertToCortexMPass(XNNPACKPass):
by call_operator.
"""

def _compute_kernel_sum(
self, weights_transposed, bias, input_offset, weight_offset
):
def _compute_kernel_sum(self, weights, bias, input_offset, weight_offset):
"""
Computes the precomputed kernel sum term (bias optional)
a * sum_j(wij + b) + ci

for i = (1, ..., n), where j indexes the input activations.

Args:
weights_transposed: Weights already in [in_features, out_features] format
"""
# No transpose needed - weights already transposed by caller
weights_transposed = weights.T
weights_int32 = weights_transposed.to(torch.int32)
offset_weights = weights_int32 + weight_offset
kernel_sum = torch.sum(offset_weights, dim=0, keepdim=True, dtype=torch.int32)
Expand Down Expand Up @@ -115,12 +110,8 @@ def _get_linear_replacement(self, node):
if len(node.args) > 2
else None
)
# Transpose weights once from PyTorch format [out_features, in_features]
# to CMSIS-NN format [in_features, out_features]
weights_transposed = weights_tensor.T.contiguous()
# Pass already-transposed weights to kernel_sum computation
kernel_sum_tensor = self._compute_kernel_sum(
weights_transposed, bias_tensor, -input_zp, -weight_zp
weights_tensor, bias_tensor, -input_zp, -weight_zp
)
with node.graph.inserting_after(weights):
kernel_sum = create_constant_placeholder(
Expand All @@ -131,17 +122,9 @@ def _get_linear_replacement(self, node):
kernel_sum_tensor,
)

weights_transposed_node = create_constant_placeholder(
self.exported_program,
node.graph,
node.name + "_weights_transposed",
InputKind.PARAMETER,
weights_transposed,
)

args = (
node.args[0],
weights_transposed_node,
weights,
None,
kernel_sum,
-input_zp,
Expand Down