From f8b508181ab6c1c02f5784578d57a420a585ee9e Mon Sep 17 00:00:00 2001
From: Richard Winterton <rrwinterton@gmail.com>
Date: Fri, 3 May 2019 10:01:33 -0700
Subject: [PATCH 1/6] simdsignunsignedextendedload.md

proposes 8, 16, 32 signed and unsigned extended load instructions.
---
 .../simd/simdsignandunsignedextendedloads.md  | 64 +++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100644 proposals/simd/simdsignandunsignedextendedloads.md

diff --git a/proposals/simd/simdsignandunsignedextendedloads.md b/proposals/simd/simdsignandunsignedextendedloads.md
new file mode 100644
index 000000000..6361273ee
--- /dev/null
+++ b/proposals/simd/simdsignandunsignedextendedloads.md
@@ -0,0 +1,64 @@
+### **Proposal WebAssembly SIMD Modification**
+
+Currently, as proposed, there is an instruction defined in the WASM SIMD ISA as follows.
+
+**i8x16.mul** which is a register to register operation that takes 16 8 bit integers and
+
+multiplies them together resulting in an 8 bit value. If the distribution of the integers is flat this
+
+would result in a large percent of the instructions with overflow. This is a problem for many applications.
+
+
+### Proposed new instructions
+
+Six new load instructions are being proposed to make integer multiplies easier: i8x8.zxload, i8x8.sxload, i16x4.zxload, i16x4.sxload, i32x2.zxload, i32x2.sxload. This would make i8, i16, i32 multiplies useful and more practical for applications such as machine learning, image compression, and video and rendering data processing. The new instructions would take consecutive integers of the corresponding size and zero extend or sign extend the consecutive bytes, words or dwords to the promoted size of unsigned or signed data. An example using zero extension is shown at the end of this document. Intel and ARM both have this capability through the following instructions:
+
+Intel Instructions:
+
+
+
+*   pmovzxbw
+*   pmovzxwd
+*   pmovzxdq
+*   pmovsxbw
+*   pmovsxwd
+*   pmovsxdq
+
+ARM Instructions:
+
+
+
+*   LDR X0, [X1] Load from the address in X1
+*   LDR X0, [X1, #8] Load from address X1 + 8
+*   LDR X0, [X1, X2] Load from address X1 + X2
+*   LDR X0, [X1, X2, LSL #3] Load from address X1 + (X2 << 3)
+*   LDR X0, [X1, W2, SXTW] Load from address X1 + sign extend(W2)
+*   LDR X0, [X1, W2, SXTW #3] Load from address X1 + (sign extend(W2) << 3)
+
+So the new instructions for WASM would be defined as follows:
+
+
+
+*   i8x8.zxload
+*   i16x4.zxload
+*   i32x2.zxload
+*   i8x8.sxload
+*   i16x4.sxload
+*   i32x2.sxload
+
+As a result of these new instructions a multiply can now be done without worrying about signed
+
+and unsigned overflow on the data it operates on.
+
+The following is a partial sample of how an extended load (here the zero extending `pmovzxbw`) can be used in a matrix multiply of 8 bit integers:
+
+
+```
+       "pmovzxbw 0x00(%[mem]), %%xmm0\n\t"
+       "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
+       "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
+       "pmaddwd %%xmm0, %%xmm2         \n\t"
+       "pmaddwd %%xmm0, %%xmm3         \n\t"
+       "paddd %%xmm2, %%xmm4           \n\t"
+       "paddd %%xmm3, %%xmm5           \n\t"
+```

From 685e1eb33c818df8f8a034dffc8bf043f0f754a0 Mon Sep 17 00:00:00 2001
From: Petr Penzin <petr.penzin@intel.com>
Date: Wed, 17 Jul 2019 10:02:05 -0700
Subject: [PATCH 2/6] Add extended load definitions to SIMD.md

---
 proposals/simd/SIMD.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/proposals/simd/SIMD.md b/proposals/simd/SIMD.md
index 4862364e8..94806c4ec 100644
--- a/proposals/simd/SIMD.md
+++ b/proposals/simd/SIMD.md
@@ -666,6 +666,13 @@ natural alignment.
 
 Load a `v128` vector from the given heap address.
 
+Extended loads:
+
+* `i8x8.zxload(memarg) -> v128`: load eight 8-bit integers and zero extend each one to a 16-bit lane
+* `i8x8.sxload(memarg) -> v128`: load eight 8-bit integers and sign extend each one to a 16-bit lane
+* `i16x4.zxload(memarg) -> v128`: load four 16-bit integers and zero extend each one to a 32-bit lane
+* `i16x4.sxload(memarg) -> v128`: load four 16-bit integers and sign extend each one to a 32-bit lane
+
 ### Store
 
 * `v128.store(memarg, data: v128)`

From 6863313278f91c0341c7fc6651369b1417f62e6e Mon Sep 17 00:00:00 2001
From: Petr Penzin <petr.penzin@intel.com>
Date: Wed, 17 Jul 2019 10:34:25 -0700
Subject: [PATCH 3/6] Add extended load definitions to BinarySIMD.md

---
 proposals/simd/BinarySIMD.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/proposals/simd/BinarySIMD.md b/proposals/simd/BinarySIMD.md
index ff51a7f0f..53575c1c7 100644
--- a/proposals/simd/BinarySIMD.md
+++ b/proposals/simd/BinarySIMD.md
@@ -166,5 +166,9 @@ The `v8x16.shuffle2_imm` instruction has 16 bytes after `simdop`.
 | `f32x4.convert_u/i32x4`   |    `0xb0`| -                  |
 | `f64x2.convert_s/i64x2`   |    `0xb1`| -                  |
 | `f64x2.convert_u/i64x2`   |    `0xb2`| -                  |
+| `i8x8.zxload`             |    `0xb3`| m:memarg           |
+| `i8x8.sxload`             |    `0xb4`| m:memarg           |
+| `i16x4.zxload`            |    `0xb5`| m:memarg           |
+| `i16x4.sxload`            |    `0xb6`| m:memarg           |
 | `v8x16.shuffle1`          |    `0xc0`| -                  |
-| `v8x16.shuffle2_imm`      |    `0xc1`| s:LaneIdx32[16]    |
\ No newline at end of file
+| `v8x16.shuffle2_imm`      |    `0xc1`| s:LaneIdx32[16]    |

From c932ed969b049be562cb3dbcf066686a22dc66db Mon Sep 17 00:00:00 2001
From: Petr Penzin <petr.penzin@intel.com>
Date: Wed, 17 Jul 2019 10:37:38 -0700
Subject: [PATCH 4/6] Create docs directory

---
 .../SIMD-sign-and-zero-extended-loads.md}           |   0
 .../simd/{ => docs}/WebAssembly-SIMD-May-2017.pdf   | Bin
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename proposals/simd/{simdsignandunsignedextendedloads.md => docs/SIMD-sign-and-zero-extended-loads.md} (100%)
 rename proposals/simd/{ => docs}/WebAssembly-SIMD-May-2017.pdf (100%)

diff --git a/proposals/simd/simdsignandunsignedextendedloads.md b/proposals/simd/docs/SIMD-sign-and-zero-extended-loads.md
similarity index 100%
rename from proposals/simd/simdsignandunsignedextendedloads.md
rename to proposals/simd/docs/SIMD-sign-and-zero-extended-loads.md
diff --git a/proposals/simd/WebAssembly-SIMD-May-2017.pdf b/proposals/simd/docs/WebAssembly-SIMD-May-2017.pdf
similarity index 100%
rename from proposals/simd/WebAssembly-SIMD-May-2017.pdf
rename to proposals/simd/docs/WebAssembly-SIMD-May-2017.pdf

From fd955ac4f42e7b602ef79ef359a0ec8d79c7a277 Mon Sep 17 00:00:00 2001
From: Petr Penzin <petr.penzin@intel.com>
Date: Thu, 18 Jul 2019 14:49:44 -0700
Subject: [PATCH 5/6] Implementation status for extended loads

---
 proposals/simd/ImplementationStatus.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/proposals/simd/ImplementationStatus.md b/proposals/simd/ImplementationStatus.md
index 7d612fee7..1b050e7e4 100644
--- a/proposals/simd/ImplementationStatus.md
+++ b/proposals/simd/ImplementationStatus.md
@@ -139,6 +139,10 @@
 | `f32x4.convert_u/i32x4`   |               `-msimd128` | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
 | `f64x2.convert_s/i64x2`   | `-munimplemented-simd128` |                    | :heavy_check_mark: | :heavy_check_mark: |
 | `f64x2.convert_u/i64x2`   | `-munimplemented-simd128` |                    | :heavy_check_mark: | :heavy_check_mark: |
+| `i8x8.zxload`             |                           |                    |                    |                    |
+| `i8x8.sxload`             |                           |                    |                    |                    |
+| `i16x4.zxload`            |                           |                    |                    |                    |
+| `i16x4.sxload`            |                           |                    |                    |                    |
 | `v8x16.shuffle1`          |                           |                    | :heavy_check_mark: |                    |
 | `v8x16.shuffle2_imm`      |                           |                    | :heavy_check_mark: | :heavy_check_mark: |
 

From 429bacf7ec6f8a8af5ee17beaf9c0c3caec92abc Mon Sep 17 00:00:00 2001
From: Petr Penzin <petr.penzin@intel.com>
Date: Tue, 23 Jul 2019 14:29:39 -0700
Subject: [PATCH 6/6] Extended loads: 32->64 bit ops

---
 proposals/simd/BinarySIMD.md           | 2 ++
 proposals/simd/ImplementationStatus.md | 2 ++
 proposals/simd/SIMD.md                 | 2 ++
 3 files changed, 6 insertions(+)

diff --git a/proposals/simd/BinarySIMD.md b/proposals/simd/BinarySIMD.md
index 53575c1c7..b0a769682 100644
--- a/proposals/simd/BinarySIMD.md
+++ b/proposals/simd/BinarySIMD.md
@@ -170,5 +170,7 @@ The `v8x16.shuffle2_imm` instruction has 16 bytes after `simdop`.
 | `i8x8.sxload`             |    `0xb4`| m:memarg           |
 | `i16x4.zxload`            |    `0xb5`| m:memarg           |
 | `i16x4.sxload`            |    `0xb6`| m:memarg           |
+| `i32x2.zxload`            |    `0xb7`| m:memarg           |
+| `i32x2.sxload`            |    `0xb8`| m:memarg           |
 | `v8x16.shuffle1`          |    `0xc0`| -                  |
 | `v8x16.shuffle2_imm`      |    `0xc1`| s:LaneIdx32[16]    |
diff --git a/proposals/simd/ImplementationStatus.md b/proposals/simd/ImplementationStatus.md
index 1b050e7e4..fb2a3ac30 100644
--- a/proposals/simd/ImplementationStatus.md
+++ b/proposals/simd/ImplementationStatus.md
@@ -143,6 +143,8 @@
 | `i8x8.sxload`             |                           |                    |                    |                    |
 | `i16x4.zxload`            |                           |                    |                    |                    |
 | `i16x4.sxload`            |                           |                    |                    |                    |
+| `i32x2.zxload`            |                           |                    |                    |                    |
+| `i32x2.sxload`            |                           |                    |                    |                    |
 | `v8x16.shuffle1`          |                           |                    | :heavy_check_mark: |                    |
 | `v8x16.shuffle2_imm`      |                           |                    | :heavy_check_mark: | :heavy_check_mark: |
 
diff --git a/proposals/simd/SIMD.md b/proposals/simd/SIMD.md
index 94806c4ec..40e4eabd3 100644
--- a/proposals/simd/SIMD.md
+++ b/proposals/simd/SIMD.md
@@ -672,6 +672,8 @@ Extended loads:
 * `i8x8.sxload(memarg) -> v128`: load eight 8-bit integers and sign extend each one to a 16-bit lane
 * `i16x4.zxload(memarg) -> v128`: load four 16-bit integers and zero extend each one to a 32-bit lane
 * `i16x4.sxload(memarg) -> v128`: load four 16-bit integers and sign extend each one to a 32-bit lane
+* `i32x2.zxload(memarg) -> v128`: load two 32-bit integers and zero extend each one to a 64-bit lane
+* `i32x2.sxload(memarg) -> v128`: load two 32-bit integers and sign extend each one to a 64-bit lane
 
 ### Store