diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index ff4ba94b8cc..6c86c257115 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -11,6 +11,9 @@ on: paths: - .ci/docker/ci_commit_pins/pytorch.txt - .ci/scripts/** + - backends/arm/** + - examples/arm/** + - backends/cortex_m/** workflow_dispatch: concurrency: @@ -1051,6 +1054,7 @@ jobs: runs-on: ubuntu-latest outputs: qnn: ${{ steps.filter.outputs.qnn }} + arm: ${{ steps.filter.outputs.arm }} steps: - uses: actions/checkout@v4 - uses: dorny/paths-filter@v3 @@ -1061,6 +1065,10 @@ jobs: - 'backends/qualcomm/**' - 'examples/qualcomm/**' - 'examples/models/llama/**' + arm: + - 'backends/arm/**' + - 'examples/arm/**' + - 'backends/cortex_m/**' test-static-llama-qnn-eval-linux: needs: changes # has dependency on changes jobs defined above @@ -1135,6 +1143,8 @@ jobs: }" test-mcu-cortex-m-backend: + needs: changes + if: needs.changes.outputs.arm == 'true' name: test-mcu-cortex-m-backend uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main permissions: diff --git a/backends/cortex_m/ops/operators.py b/backends/cortex_m/ops/operators.py index b4a36bc7258..0bc9702d680 100644 --- a/backends/cortex_m/ops/operators.py +++ b/backends/cortex_m/ops/operators.py @@ -352,7 +352,7 @@ def quantized_linear_meta( activation_min, ) -> torch.Tensor: - shape = (*input.shape[:-1], weights.shape[1]) + shape = (*input.shape[:-1], weights.shape[0]) return torch.empty(shape, dtype=input.dtype, device=input.device) @@ -386,7 +386,7 @@ def quantized_linear_impl( input_reshaped = input_int32.reshape(new_shape) lhs_sum = torch.sum(input_reshaped, dim=-1, keepdim=True) * filter_offset - output = torch.mm(input_reshaped, weights_int32) + lhs_sum + kernel_sum + output = torch.mm(input_reshaped, weights_int32.T) + lhs_sum + kernel_sum output_shape = (*input.shape[:-1], output.shape[-1]) output_reshaped = output.reshape(output_shape) else: @@ -396,7 +396,7 @@ def quantized_linear_impl( new_shape = (prod(input.shape[:-1]), input.shape[-1]) input_reshaped = input_int32.reshape(new_shape) - output = torch.mm(input_reshaped, weights_int32) + output = torch.mm(input_reshaped, weights_int32.T) if bias is not None: output = output + bias output_shape = (*input.shape[:-1], output.shape[-1]) diff --git a/backends/cortex_m/passes/convert_to_cortex_m_pass.py b/backends/cortex_m/passes/convert_to_cortex_m_pass.py index 95c10369009..8da0e720036 100644 --- a/backends/cortex_m/passes/convert_to_cortex_m_pass.py +++ b/backends/cortex_m/passes/convert_to_cortex_m_pass.py @@ -33,19 +33,14 @@ class ConvertToCortexMPass(XNNPACKPass): by call_operator. """ - def _compute_kernel_sum( - self, weights_transposed, bias, input_offset, weight_offset - ): + def _compute_kernel_sum(self, weights, bias, input_offset, weight_offset): """ Computes the precomputed kernel sum term (bias optional) a * sum_j(wij + b) + ci for i = (1, ..., n), where j indexes the input activations. - - Args: - weights_transposed: Weights already in [in_features, out_features] format """ - # No transpose needed - weights already transposed by caller + weights_transposed = weights.T weights_int32 = weights_transposed.to(torch.int32) offset_weights = weights_int32 + weight_offset kernel_sum = torch.sum(offset_weights, dim=0, keepdim=True, dtype=torch.int32) @@ -115,12 +110,8 @@ def _get_linear_replacement(self, node): if len(node.args) > 2 else None ) - # Transpose weights once from PyTorch format [out_features, in_features] - # to CMSIS-NN format [in_features, out_features] - weights_transposed = weights_tensor.T.contiguous() - # Pass already-transposed weights to kernel_sum computation kernel_sum_tensor = self._compute_kernel_sum( - weights_transposed, bias_tensor, -input_zp, -weight_zp + weights_tensor, bias_tensor, -input_zp, -weight_zp ) with node.graph.inserting_after(weights): kernel_sum = create_constant_placeholder( @@ -131,17 +122,9 @@ def _get_linear_replacement(self, node): kernel_sum_tensor, ) - weights_transposed_node = create_constant_placeholder( - self.exported_program, - node.graph, - node.name + "_weights_transposed", - InputKind.PARAMETER, - weights_transposed, - ) - args = ( node.args[0], - weights_transposed_node, + weights, None, kernel_sum, -input_zp,