-
Notifications
You must be signed in to change notification settings - Fork 3.8k
Description
I've started noticing a large performance regression affecting Keras MobileNetV2 caused by INDEX_DEFAULT_I64=ON (PR #6143). This is on an AWS m5.12xlarge instance.
| INDEX_DEFAULT_I64 setting | Frames per second |
|---|---|
| ON | 66.56 |
| OFF | 435.49 |
Profile with INDEX_DEFAULT_I64=OFF (fast)
Node Name Ops Time(us) Time(%) Shape Inputs Outputs
--------- --- -------- ------- ----- ------ -------
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_7 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_7 64.704 3.571 (1, 9, 56, 56, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_6 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_6 53.362 2.945 (1, 2, 112, 112, 16) 3 1
fused_nn_pad_3 fused_nn_pad_3 50.582 2.791 (1, 6, 113, 113, 16) 1 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_5 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_5 47.874 2.642 (1, 6, 56, 56, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_6 fused_nn_contrib_conv2d_NCHWc_add_clip_6 46.828 2.584 (1, 6, 112, 112, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_8 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_8 42.364 2.338 (1, 12, 28, 28, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_91 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_9 39.554 2.183 (1, 36, 14, 14, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_81 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_8 39.418 2.175 (1, 12, 28, 28, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_add_4 fused_nn_contrib_conv2d_NCHWc_add_add_4 38.871 2.145 (1, 2, 56, 56, 12) 4 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_9 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_9 37.926 2.093 (1, 36, 14, 14, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_5 fused_nn_contrib_conv2d_NCHWc_add_clip_5 37.407 2.064 (1, 9, 56, 56, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_51 fused_nn_contrib_conv2d_NCHWc_add_clip_5 35.349 1.951 (1, 9, 56, 56, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip fused_nn_contrib_conv2d_NCHWc_add_clip 34.692 1.915 (1, 80, 7, 7, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_6 fused_nn_contrib_conv2d_NCHWc_add_6 34.052 1.879 (1, 1, 112, 112, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add fused_nn_contrib_conv2d_NCHWc_add 33.58 1.853 (1, 20, 7, 7, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_21 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2 33.298 1.838 (1, 24, 14, 14, 16) 3 1
fused_nn_pad_2 fused_nn_pad_2 33.201 1.832 (1, 9, 57, 57, 16) 1 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_22 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2 33.057 1.824 (1, 24, 14, 14, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2 33.027 1.823 (1, 24, 14, 14, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_23 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2 32.787 1.809 (1, 24, 14, 14, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_5 fused_nn_contrib_conv2d_NCHWc_add_5 32.332 1.784 (1, 2, 56, 56, 12) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip 32.156 1.775 (1, 60, 7, 7, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip 31.68 1.748 (1, 60, 7, 7, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip2 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip 30.832 1.701 (1, 60, 7, 7, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_7 fused_nn_contrib_conv2d_NCHWc_add_clip_7 30.521 1.684 (1, 2, 112, 112, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_add_11 fused_nn_contrib_conv2d_NCHWc_add_add_1 30.012 1.656 (1, 6, 14, 14, 16) 4 1
fused_nn_contrib_conv2d_NCHWc_add_add_1 fused_nn_contrib_conv2d_NCHWc_add_add_1 29.914 1.651 (1, 6, 14, 14, 16) 4 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_4 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_4 28.642 1.581 (1, 9, 28, 28, 16) 3 1
fused_nn_global_avg_pool2d fused_nn_global_avg_pool2d 28.552 1.576 (1, 80, 1, 1, 16) 1 1
fused_layout_transform_40 fused_layout_transform_40 26.741 1.476 (1, 8, 56, 56, 12) 1 1
fused_layout_transform_41 fused_layout_transform_41 25.793 1.423 (1, 12, 56, 56, 12) 1 1
fused_nn_contrib_conv2d_NCHWc_add_add1 fused_nn_contrib_conv2d_NCHWc_add_add 25.759 1.422 (1, 10, 7, 7, 16) 4 1
fused_nn_contrib_conv2d_NCHWc_add_add_2 fused_nn_contrib_conv2d_NCHWc_add_add_2 25.566 1.411 (1, 4, 14, 14, 16) 4 1
fused_nn_dense_add fused_nn_dense_add 25.52 1.408 (1, 1000) 3 1
fused_nn_contrib_conv2d_NCHWc_add_add fused_nn_contrib_conv2d_NCHWc_add_add 25.391 1.401 (1, 10, 7, 7, 16) 4 1
fused_nn_contrib_conv2d_NCHWc_add_clip_21 fused_nn_contrib_conv2d_NCHWc_add_clip_2 25.345 1.399 (1, 36, 14, 14, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_2 fused_nn_contrib_conv2d_NCHWc_add_clip_2 25.262 1.394 (1, 36, 14, 14, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_22 fused_nn_contrib_conv2d_NCHWc_add_clip_2 24.895 1.374 (1, 36, 14, 14, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_add_3 fused_nn_contrib_conv2d_NCHWc_add_add_3 24.679 1.362 (1, 2, 28, 28, 16) 4 1
fused_nn_contrib_conv2d_NCHWc_add_add_31 fused_nn_contrib_conv2d_NCHWc_add_add_3 24.553 1.355 (1, 2, 28, 28, 16) 4 1
fused_nn_contrib_conv2d_NCHWc_add_2 fused_nn_contrib_conv2d_NCHWc_add_2 23.364 1.289 (1, 6, 14, 14, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_add_21 fused_nn_contrib_conv2d_NCHWc_add_add_2 23.264 1.284 (1, 4, 14, 14, 16) 4 1
fused_nn_contrib_conv2d_NCHWc_add_add_22 fused_nn_contrib_conv2d_NCHWc_add_add_2 23.006 1.27 (1, 4, 14, 14, 16) 4 1
fused_nn_contrib_conv2d_NCHWc_add_clip_11 fused_nn_contrib_conv2d_NCHWc_add_clip_1 22.724 1.254 (1, 60, 7, 7, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_32 fused_nn_contrib_conv2d_NCHWc_add_clip_3 22.722 1.254 (1, 24, 14, 14, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_41 fused_nn_contrib_conv2d_NCHWc_add_clip_4 22.522 1.243 (1, 12, 28, 28, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_1 fused_nn_contrib_conv2d_NCHWc_add_clip_1 22.247 1.228 (1, 60, 7, 7, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_33 fused_nn_contrib_conv2d_NCHWc_add_clip_3 21.648 1.195 (1, 24, 14, 14, 16) 3 1
fused_nn_pad fused_nn_pad 21.439 1.183 (1, 36, 15, 15, 16) 1 1
fused_nn_contrib_conv2d_NCHWc_add_clip_12 fused_nn_contrib_conv2d_NCHWc_add_clip_1 21.437 1.183 (1, 60, 7, 7, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_4 fused_nn_contrib_conv2d_NCHWc_add_4 21.426 1.182 (1, 2, 28, 28, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_1 fused_nn_contrib_conv2d_NCHWc_add_1 21.227 1.171 (1, 10, 7, 7, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_31 fused_nn_contrib_conv2d_NCHWc_add_clip_3 20.739 1.145 (1, 24, 14, 14, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_3 fused_nn_contrib_conv2d_NCHWc_add_clip_3 20.719 1.143 (1, 24, 14, 14, 16) 3 1
fused_nn_softmax fused_nn_softmax 19.798 1.093 (1, 1000) 1 1
fused_nn_contrib_conv2d_NCHWc_add_clip_42 fused_nn_contrib_conv2d_NCHWc_add_clip_4 19.751 1.09 (1, 12, 28, 28, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_4 fused_nn_contrib_conv2d_NCHWc_add_clip_4 19.679 1.086 (1, 12, 28, 28, 16) 3 1
fused_nn_pad_1 fused_nn_pad_1 18.729 1.034 (1, 12, 29, 29, 16) 1 1
fused_nn_contrib_conv2d_NCHWc_add_3 fused_nn_contrib_conv2d_NCHWc_add_3 18.411 1.016 (1, 4, 14, 14, 16) 3 1
fused_nn_pad_layout_transform fused_nn_pad_layout_transform 18.159 1.002 (1, 1, 225, 225, 3) 1 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_3 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_3 15.938 0.88 (1, 12, 14, 14, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_1 15.438 0.852 (1, 36, 7, 7, 16) 3 1
fused_layout_transform_transpose_nn_batch_flatten fused_layout_transform_transpose_nn_batch_flatten 1.563 0.086 (1, 1280) 1 1
Total_time - 1812.033 - - - -
Profile with INDEX_DEFAULT_I64=ON (slow)
Node Name Ops Time(us) Time(%) Shape Inputs Outputs
--------- --- -------- ------- ----- ------ -------
fused_nn_contrib_conv2d_NCHWc_add_add_1 fused_nn_contrib_conv2d_NCHWc_add_add_1 3105.8 21.391 (1, 6, 14, 14, 16) 4 1
fused_nn_contrib_conv2d_NCHWc_add_add_11 fused_nn_contrib_conv2d_NCHWc_add_add_1 3104.62 21.382 (1, 6, 14, 14, 16) 4 1
fused_nn_contrib_conv2d_NCHWc_add_add_2 fused_nn_contrib_conv2d_NCHWc_add_add_2 2200.03 15.152 (1, 4, 14, 14, 16) 4 1
fused_nn_contrib_conv2d_NCHWc_add_add_21 fused_nn_contrib_conv2d_NCHWc_add_add_2 2189.84 15.082 (1, 4, 14, 14, 16) 4 1
fused_nn_contrib_conv2d_NCHWc_add_add_22 fused_nn_contrib_conv2d_NCHWc_add_add_2 2185.71 15.054 (1, 4, 14, 14, 16) 4 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_7 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_7 60.094 0.414 (1, 9, 56, 56, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_91 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_9 52.82 0.364 (1, 36, 14, 14, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_6 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_6 51.393 0.354 (1, 2, 112, 112, 16) 3 1
fused_nn_pad_3 fused_nn_pad_3 51.19 0.353 (1, 6, 113, 113, 16) 1 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_5 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_5 49.058 0.338 (1, 6, 56, 56, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_6 fused_nn_contrib_conv2d_NCHWc_add_clip_6 46.637 0.321 (1, 6, 112, 112, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2 43.381 0.299 (1, 24, 14, 14, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_8 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_8 40.165 0.277 (1, 12, 28, 28, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_23 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2 39.355 0.271 (1, 24, 14, 14, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_22 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2 39.205 0.27 (1, 24, 14, 14, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_add_4 fused_nn_contrib_conv2d_NCHWc_add_add_4 38.595 0.266 (1, 2, 56, 56, 12) 4 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_9 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_9 38.019 0.262 (1, 36, 14, 14, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_81 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_8 37.559 0.259 (1, 12, 28, 28, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_5 fused_nn_contrib_conv2d_NCHWc_add_clip_5 36.159 0.249 (1, 9, 56, 56, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_51 fused_nn_contrib_conv2d_NCHWc_add_clip_5 35.269 0.243 (1, 9, 56, 56, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip fused_nn_contrib_conv2d_NCHWc_add_clip 34.755 0.239 (1, 80, 7, 7, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_2 fused_nn_contrib_conv2d_NCHWc_add_2 34.248 0.236 (1, 6, 14, 14, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_6 fused_nn_contrib_conv2d_NCHWc_add_6 33.65 0.232 (1, 1, 112, 112, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_7 fused_nn_contrib_conv2d_NCHWc_add_clip_7 33.163 0.228 (1, 2, 112, 112, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_21 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2 32.593 0.224 (1, 24, 14, 14, 16) 3 1
fused_nn_pad_2 fused_nn_pad_2 32.542 0.224 (1, 9, 57, 57, 16) 1 1
fused_nn_contrib_conv2d_NCHWc_add fused_nn_contrib_conv2d_NCHWc_add 32.471 0.224 (1, 20, 7, 7, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_5 fused_nn_contrib_conv2d_NCHWc_add_5 31.587 0.218 (1, 2, 56, 56, 12) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip 30.659 0.211 (1, 60, 7, 7, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip 30.109 0.207 (1, 60, 7, 7, 16) 3 1
fused_nn_pad fused_nn_pad 29.258 0.202 (1, 36, 15, 15, 16) 1 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_4 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_4 29.083 0.2 (1, 9, 28, 28, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_2 fused_nn_contrib_conv2d_NCHWc_add_clip_2 28.273 0.195 (1, 36, 14, 14, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip2 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip 28.052 0.193 (1, 60, 7, 7, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_22 fused_nn_contrib_conv2d_NCHWc_add_clip_2 27.855 0.192 (1, 36, 14, 14, 16) 3 1
fused_layout_transform_40 fused_layout_transform_40 27.811 0.192 (1, 8, 56, 56, 12) 1 1
fused_nn_global_avg_pool2d fused_nn_global_avg_pool2d 27.724 0.191 (1, 80, 1, 1, 16) 1 1
fused_layout_transform_41 fused_layout_transform_41 27.308 0.188 (1, 12, 56, 56, 12) 1 1
fused_nn_dense_add fused_nn_dense_add 26.655 0.184 (1, 1000) 3 1
fused_nn_contrib_conv2d_NCHWc_add_1 fused_nn_contrib_conv2d_NCHWc_add_1 26.406 0.182 (1, 10, 7, 7, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_add fused_nn_contrib_conv2d_NCHWc_add_add 25.447 0.175 (1, 10, 7, 7, 16) 4 1
fused_nn_contrib_conv2d_NCHWc_add_clip_21 fused_nn_contrib_conv2d_NCHWc_add_clip_2 25.433 0.175 (1, 36, 14, 14, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_add1 fused_nn_contrib_conv2d_NCHWc_add_add 25.276 0.174 (1, 10, 7, 7, 16) 4 1
fused_nn_contrib_conv2d_NCHWc_add_clip_11 fused_nn_contrib_conv2d_NCHWc_add_clip_1 24.78 0.171 (1, 60, 7, 7, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_add_31 fused_nn_contrib_conv2d_NCHWc_add_add_3 24.132 0.166 (1, 2, 28, 28, 16) 4 1
fused_nn_contrib_conv2d_NCHWc_add_clip_12 fused_nn_contrib_conv2d_NCHWc_add_clip_1 23.359 0.161 (1, 60, 7, 7, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_add_3 fused_nn_contrib_conv2d_NCHWc_add_add_3 23.226 0.16 (1, 2, 28, 28, 16) 4 1
fused_nn_contrib_conv2d_NCHWc_add_clip_31 fused_nn_contrib_conv2d_NCHWc_add_clip_3 22.999 0.158 (1, 24, 14, 14, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_1 fused_nn_contrib_conv2d_NCHWc_add_clip_1 22.372 0.154 (1, 60, 7, 7, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_41 fused_nn_contrib_conv2d_NCHWc_add_clip_4 21.948 0.151 (1, 12, 28, 28, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_4 fused_nn_contrib_conv2d_NCHWc_add_4 21.359 0.147 (1, 2, 28, 28, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_1 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_1 21.269 0.146 (1, 36, 7, 7, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_33 fused_nn_contrib_conv2d_NCHWc_add_clip_3 20.916 0.144 (1, 24, 14, 14, 16) 3 1
fused_nn_softmax fused_nn_softmax 20.415 0.141 (1, 1000) 1 1
fused_nn_contrib_conv2d_NCHWc_add_clip_3 fused_nn_contrib_conv2d_NCHWc_add_clip_3 20.37 0.14 (1, 24, 14, 14, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_4 fused_nn_contrib_conv2d_NCHWc_add_clip_4 19.395 0.134 (1, 12, 28, 28, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_32 fused_nn_contrib_conv2d_NCHWc_add_clip_3 19.306 0.133 (1, 24, 14, 14, 16) 3 1
fused_nn_pad_1 fused_nn_pad_1 19.284 0.133 (1, 12, 29, 29, 16) 1 1
fused_nn_contrib_conv2d_NCHWc_add_clip_42 fused_nn_contrib_conv2d_NCHWc_add_clip_4 18.807 0.13 (1, 12, 28, 28, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_3 fused_nn_contrib_conv2d_NCHWc_add_3 17.728 0.122 (1, 4, 14, 14, 16) 3 1
fused_nn_pad_layout_transform fused_nn_pad_layout_transform 15.683 0.108 (1, 1, 225, 225, 3) 1 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_3 fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_3 15.236 0.105 (1, 12, 14, 14, 16) 3 1
fused_layout_transform_transpose_nn_batch_flatten fused_layout_transform_transpose_nn_batch_flatten 1.607 0.011 (1, 1280) 1 1
Total_time - 14519.449 - - - -
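The slowdown comes from the following five fused conv2d + add + add ops. Each takes roughly 23–30 us with INDEX_DEFAULT_I64=OFF but roughly 2,186–3,106 us with INDEX_DEFAULT_I64=ON (about a 100x slowdown), and together they account for about 88% of the total time in the slow profile: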
fused_nn_contrib_conv2d_NCHWc_add_add_1 fused_nn_contrib_conv2d_NCHWc_add_add_1 3105.8 21.391 (1, 6, 14, 14, 16) 4 1
fused_nn_contrib_conv2d_NCHWc_add_add_11 fused_nn_contrib_conv2d_NCHWc_add_add_1 3104.62 21.382 (1, 6, 14, 14, 16) 4 1
fused_nn_contrib_conv2d_NCHWc_add_add_2 fused_nn_contrib_conv2d_NCHWc_add_add_2 2200.03 15.152 (1, 4, 14, 14, 16) 4 1
fused_nn_contrib_conv2d_NCHWc_add_add_21 fused_nn_contrib_conv2d_NCHWc_add_add_2 2189.84 15.082 (1, 4, 14, 14, 16) 4 1
fused_nn_contrib_conv2d_NCHWc_add_add_22 fused_nn_contrib_conv2d_NCHWc_add_add_2 2185.71 15.054 (1, 4, 14, 14, 16) 4 1
Here is a script to reproduce:
import time
import numpy as np
import tvm
from tvm import relay
from tvm.contrib import graph_runtime
import tensorflow as tf
# Reproduction script: convert Keras MobileNetV2 to Relay, build it for an
# AVX-512 Skylake CPU target, and measure end-to-end latency through the
# graph runtime.
input_shape = (1, 3, 224, 224)
model = tf.keras.applications.MobileNetV2()
mod, params = relay.frontend.from_keras(model, shape={'input_1': input_shape})
dtype = 'float32'
with relay.build_config(opt_level=3):
    graph, lib, params = relay.build(mod, "llvm -mcpu=skylake-avx512", params=params)

# Random input in [0, 1) with the same shape that was given to the frontend.
i_data = np.random.uniform(0, 1, input_shape).astype(dtype)
# NOTE: `mod` is rebound here from the Relay module to the graph runtime module.
mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0))
mod.set_input(**params)

# Time
times = []
for _ in range(100):
    # perf_counter() is monotonic and high-resolution, so it is the right
    # clock for measuring short intervals (time.time() can jump if the
    # system clock is adjusted).
    start_time = time.perf_counter()
    mod.run(input_1=i_data)
    res = mod.get_output(0)
    times.append(time.perf_counter() - start_time)

# The first 10 iterations are excluded from the statistics (presumably as
# warm-up). Times are in seconds, so multiply by 1000 to report milliseconds.
print('Mean latency:', 1000.0 * np.mean(times[10:]))
print('Mean FPS:', 1.0 / np.mean(times[10:]))
Thanks!
Metadata
Metadata
Assignees
Labels
No labels